0
|
1 # Description: Processing of transposon instertion sequences.
|
|
2 # Requirements perl DBI read write accessible database
|
|
3 # Requires a file labeled barcode.txt, and a tab delimited file contatining seqeunces.
|
|
4 # Should work on files of less than 40 million sequences.
|
|
5 # Identifies and removes barcodes, IRDR sequences and linker sequences from sequencing data, then compiles a list of non redundant sequences.
|
|
6 # This scripts conducts the PAPS steps 1
|
|
7 # Aaron Lyman Sarver July 12 2010
|
|
8 # version 3.1 December 1 2010
|
|
9
|
|
10 use File::Copy;
|
|
11 require "config.pl";
|
|
12
|
|
13 my $path = $0;
|
|
14 $path =~ s/\/\w*\.pl$//g;
|
|
15 print $path . "\n";
|
|
16 require "$path/util.pl";
|
|
17 require "$path/project_man.pl";
|
|
18
|
|
19 my $chrom_map_ref = &get_chrom_map();
|
|
20
|
|
21 print join("\t", keys %{$chrom_map_ref}) . "\n";
|
|
22
|
|
23 unless (-d "results") {
|
|
24 mkdir("results");
|
|
25 }
|
|
26
|
|
27 #Remove old data if it exists
|
|
28 $sth = $dbh->prepare("drop table if exists illumina_raw_$proj");
|
|
29 $sth->execute;
|
|
30 $sth = $dbh->prepare("drop table if exists barcode_$proj");
|
|
31 $sth->execute;
|
|
32 $sth = $dbh->prepare("drop table if exists illumina_decoded_$proj");
|
|
33 $sth->execute;
|
|
34 $sth = $dbh->prepare("drop table if exists illumina_without_IRDR_$proj");
|
|
35 $sth->execute;
|
|
36 $sth = $dbh->prepare("drop table if exists illumina_nr_pre_map_$proj");
|
|
37 $sth->execute;
|
|
38 $sth = $dbh->prepare("drop table if exists illumina4dec_$proj");
|
|
39 $sth->execute;
|
|
40 $sth = $dbh->prepare("drop table if exists illumina4dec2_$proj");
|
|
41 $sth->execute;
|
|
42 $sth = $dbh->prepare("drop table if exists illumina_blastout_$proj");
|
|
43 $sth->execute;
|
|
44
|
|
45 #1.1 Part 1 load database
|
|
46 print "Started PROCESSING\n Tables identified by the postfix $proj in the database";
|
|
47 #Determine number of columns in seqs.tab
|
|
48 open(SEQS, "<", "data/seqs.tab");
|
|
49 my @cols = split("\t", <SEQS>);
|
|
50 close(SEQS);
|
|
51 print join(":", @cols);
|
|
52 print "\n";
|
|
53 if ($#cols <= 2) {
|
|
54 &add_default_quality();
|
|
55 }
|
|
56 $sth = $dbh->prepare("create table illumina_raw_$proj ( id varchar(20), description varchar(20), sequence varchar(100), quality varchar(100) ) ");
|
|
57 $sth->execute;
|
|
58
|
|
59 print "generated illumina_raw table\n";
|
|
60
|
|
61 $sth = $dbh->prepare("load DATA local INFILE 'data/seqs.tab' INTO TABLE illumina_raw_$proj ");
|
|
62 $sth->execute;
|
|
63
|
|
64 print "1.1 complete - Loaded illumina_raw &r/m data/seqS.tab file \n";
|
|
65
|
|
66 #1.2
|
|
67 $sth = $dbh->prepare("create table barcode_$proj ( seq varchar(20), library varchar(30), direction varchar(20) )");
|
|
68 $sth->execute;
|
|
69
|
|
70 print "1.2 complete - Loaded barcode from data/barcode2lib.txt\n";
|
|
71
|
|
72 $sth = $dbh->prepare("load DATA local INFILE 'data/barcode2lib.txt' INTO TABLE barcode_$proj");
|
|
73 $sth->execute;
|
|
74
|
|
75 print "1.2 complete - Loaded barcode from data/barcode2lib.txt\n";
|
|
76
|
|
77 #1.3
|
|
78 $sth = $dbh->prepare("select count(*) from illumina_raw_$proj");
|
|
79 $sth->execute;
|
|
80
|
|
81 while ((@row) = $sth->fetchrow_array) {
|
|
82 print $row[0]." Number of lines in raw data file\n";
|
|
83 }
|
|
84
|
|
85 $sth = $dbh->prepare("select count(distinct id) from illumina_raw_$proj");
|
|
86 $sth->execute;
|
|
87
|
|
88 while ((@row) = $sth->fetchrow_array) {
|
|
89 print $row[0]." Number of unique ids in illumina raw\n";
|
|
90 }
|
|
91 #1.4
|
|
92 $sth = $dbh->prepare("select count(*) from barcode_$proj");
|
|
93 $sth->execute;
|
|
94
|
|
95 while ((@row) = $sth->fetchrow_array) {
|
|
96 print $row[0]." Number of lines in barcode \n";
|
|
97 }
|
|
98
|
|
99 #2.1 map the barcodes to the sequences..migrated to functionin config.pl.
|
|
100 my $barcode_length = 0;
|
|
101 open (BARCODE, "<", "data/barcode2lib.txt") || die "Unable to open barcode to library file. $!\n";
|
|
102 while (<BARCODE>) {
|
|
103 chomp;
|
|
104 my @split = split("\t", $_);
|
|
105 if (length($split[0]) > $barcode_length) { $barcode_length = length($split[0]); }
|
|
106 }
|
|
107 close(BARCODE);
|
|
108
|
|
109 $sth = $dbh->prepare("create table illumina_decoded_$proj select library, id, substring(sequence,$barcode_length+1) as decoded_sequence, substring(quality, $barcode_length+1) as decoded_quality from barcode_$proj,illumina_raw_$proj where sequence like concat(seq,'%')");
|
|
110 $sth->execute;
|
|
111 print "Mapped barcodes to sequences in table illumina_decoded2\n";
|
|
112 print "2.1 complete - Mapped barcodes to sequences in table illumina_decoded\n";
|
|
113
|
|
114 #2.2
|
|
115 $sth = $dbh->prepare("select distinct library, count(id) from illumina_decoded_$proj group by library;");
|
|
116 $sth->execute;
|
|
117
|
|
118 while ((@row) = $sth->fetchrow_array) {
|
|
119 print $row[0]."\t".$row[1]." inserts\n";
|
|
120 }
|
|
121
|
|
122 #2.3
|
|
123 if (!defined($mutagens)) {
|
|
124 #454
|
|
125 $mutagens="__GTATGTAAACTTCCGACTTCAACTG";
|
|
126 #illumina data
|
|
127 #$mutagens = "TGTATGTAAACTTCCGACTTCAACTG";
|
|
128 #illumina_2 data
|
|
129 #$mutagens = "___TGTATGTAAACTTCCGACTTCAACTG";
|
|
130 #MULV
|
|
131 #$mutagens = "CCAAACCTACAGGTGGGGTCTTTCA";
|
|
132 }
|
|
133
|
|
134 my @mutagens_array = split(",", $mutagens);
|
|
135 my $first = 1;
|
|
136
|
|
137 foreach my $mutagen (@mutagens_array) {
|
|
138 $mutagen =~ s/^\s+//;
|
|
139 $mutagen =~ s/\s+$//;
|
|
140 if ($first) {
|
|
141 $sql_str = "create table ";
|
|
142 $first = 0;
|
|
143 } else {
|
|
144 $sql_str = "insert into ";
|
|
145 }
|
|
146 $sql_str = join("", $sql_str, "illumina_without_IRDR_$proj select library,id,substring(decoded_sequence,", length($mutagen)+1, ") as insertion_sequence, substring(decoded_quality, ", length($mutagen)+1, ") as insertion_quality, 'good' as type from illumina_decoded_$proj where decoded_sequence like '", $mutagen, "%'");
|
|
147 $sth = $dbh->prepare($sql_str);
|
|
148 $sth->execute;
|
|
149 }
|
|
150 print "2.3 - Mapped and removed IRDR sequences, created table illumina_without_IRDR\n";
|
|
151
|
|
152 # 3.1 Find and Remove internal linker
|
|
153
|
|
154 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'idR' where insertion_sequence like '%GTCCCTTAAGCGGAGCC%'");
|
|
155 $sth->execute;
|
|
156 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set insertion_sequence = substring_index(insertion_sequence,'GTCCCTTAAGCGGAGCC',1) where type = 'idR';");
|
|
157 print "3.1 - Identified RIght linker in remaining seqeunces and renamed to idR.\n";
|
|
158
|
|
159 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'idL' where insertion_sequence like '%TAGTCCCTTAAGCGGAG%'");
|
|
160 $sth->execute;
|
|
161 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set insertion_sequence = substring_index(insertion_sequence,'TAGTCCCTTAAGCGGAG',1) where type = 'idL';");
|
|
162 $sth->execute;
|
|
163 print "3.2 - Identified Left linker in remaining seqeunces and renamed to idL.\n";
|
|
164
|
|
165 # Part 3.3 Find and Remove GGATCC sequences
|
|
166 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'bam' where insertion_sequence like '_____GGATCC%'");
|
|
167 $sth->execute;
|
|
168 print "3.3 - Identified seqeunce with GGATCC and renamed.\n";
|
|
169
|
|
170 # Part 3.4 Find and remove any remaining seqeunces that do not start with TA seqeunce
|
|
171 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'noTA' where insertion_sequence not like 'TA%'");
|
|
172 $sth->execute;
|
|
173 print "3.4 - Identified seqeunce without TA and renamed to noTA\n";
|
|
174
|
|
175 # 3.5 Find and identify any sequences that are two short for meaningful mapping
|
|
176 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'min' where (type = 'good' or type = 'idR' or type = 'idL') and char_length(insertion_sequence)<24");
|
|
177 $sth->execute;
|
|
178 print "3.5 Identified short seqeunces and label them min\n";
|
|
179
|
|
180 #4.1
|
|
181 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (type)");
|
|
182 $sth->execute;
|
|
183
|
|
184 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (insertion_sequence)");
|
|
185 $sth->execute;
|
|
186
|
|
187 #if ($quality) {
|
|
188 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (insertion_quality)");
|
|
189 $sth->execute;
|
|
190 print "indexed table illumina_without_IRDR\n";
|
|
191 #}
|
|
192
|
|
193 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (library)");
|
|
194 $sth->execute;
|
|
195 print "4.1 - indexed table illumina_without_IRDR\n";
|
|
196
|
|
197 ####
|
|
198 #4.2 Remove duplicate sequences maintaining count of all
|
|
199 ####
|
|
200 $sth = $dbh->prepare("create table illumina_nr_pre_map_$proj as select distinct count(type) as number, insertion_sequence, insertion_quality, library from illumina_without_IRDR_$proj where (type = 'good' or type = 'idR' or type = 'idL') group by library, insertion_sequence, insertion_quality");
|
|
201 $sth->execute;
|
|
202
|
|
203 print "4.2 - Created illumina_nr_pre_map table\n";
|
|
204
|
|
205 #4.3
|
|
206 $sth = $dbh->prepare("create table illumina4dec_$proj (id INT unsigned not null auto_increment, primary key(ID), number varchar(20), library varchar(30),sequence varchar(100), quality varchar(100) )");
|
|
207 $sth->execute;
|
|
208 print "4.3 - Created illumina4dec table\n";
|
|
209
|
|
210 #if ($quality) {
|
|
211 $sth = $dbh->prepare("insert into illumina4dec_$proj select NULL,number,library, insertion_sequence, insertion_quality from illumina_nr_pre_map_$proj");
|
|
212 #}
|
|
213 #else {
|
|
214 # $sth = $dbh->prepare("insert into illumina4dec_$proj select NULL,number,library, insertion_sequence from illumina_nr_pre_map_$proj");
|
|
215 #}
|
|
216 $sth->execute;
|
|
217
|
|
218 print "4.4 - populated nr_id table\n";
|
|
219
|
|
220
|
|
221 $sth = $dbh->prepare("create table illumina4dec2_$proj select * from illumina4dec_$proj");
|
|
222 $sth->execute;
|
|
223 print "4.5 - Copied illumina4dec table to illumina4dec2 for later use\n";
|
|
224
|
|
225 ####
|
|
226 # Bowtie variable definitions
|
|
227 ####
|
|
228 if (length($bowtie_exe) == 0) { $bowtie_exe = "/project/ccbioinf/Projects/labs/largaespada/Illumina_data/bowtie-0.12.5/bowtie"; }
|
|
229 if (length($bowtie_idx) == 0) { $bowtie_idx = "mm9"; }
|
|
230 if (length($envDir) == 0) { $envDir = "."; }
|
|
231
|
|
232 ####
|
|
233 #5 This section of the script runs the bowtie algorythm.
|
|
234 ####
|
|
235 &alignment();
|
|
236 print "5.1c - FINISHED Bowtie iteration #1\n";
|
|
237
|
|
238 $sth = $dbh->prepare("drop table if exists bowtie_$proj ");
|
|
239 $sth->execute;
|
|
240 $sth = $dbh->prepare("drop table if exists bowtie_lib_$proj ");
|
|
241 $sth->execute;
|
|
242
|
|
243 #####
|
|
244 # 6 reduce to NR set and load library and location mapping.
|
|
245 ######
|
|
246 $sth = $dbh->prepare("create table bowtie_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,':',-2),':',1) as library, substring_index(substring_index(B.id,':',-1),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B");
|
|
247 $sth->execute;
|
|
248
|
|
249 $sth = $dbh->prepare("alter table bowtie_$proj add pos varchar(20)");
|
|
250 $sth->execute;
|
|
251 $sth = $dbh->prepare("alter table bowtie_$proj add stop varchar(20)");
|
|
252 $sth->execute;
|
|
253
|
|
254 $sth = $dbh->prepare("update bowtie_$proj set pos = size + start where strand = '-'");
|
|
255 $sth->execute;
|
|
256
|
|
257 $sth = $dbh->prepare("update bowtie_$proj set pos = start + 1 where strand = '+'");
|
|
258 $sth->execute;
|
|
259
|
|
260 $sth = $dbh->prepare("update bowtie_$proj set stop = start + size");
|
|
261 $sth->execute;
|
|
262
|
|
263 print "6.1 6.2 6.3 6.4 - created bowtie table and adjusted \n";
|
|
264 ###
|
|
265 #6.5 determine transposon orientation (is it in the A or B orientation?)
|
|
266 ###
|
|
267
|
|
268 $sth = $dbh->prepare("alter table bowtie_$proj add orient varchar(5) default 'U'");
|
|
269 $sth->execute;
|
|
270
|
|
271 $sth = $dbh->prepare("update bowtie_$proj set orient = '-' where library like '%-L' and strand = '+'");
|
|
272 $sth->execute;
|
|
273
|
|
274 $sth = $dbh->prepare("update bowtie_$proj set orient = '+' where library like '%-L' and strand = '-'");
|
|
275 $sth->execute;
|
|
276
|
|
277 $sth = $dbh->prepare("update bowtie_$proj set orient = '-' where library like '%-R' and strand = '-'");
|
|
278 $sth->execute;
|
|
279
|
|
280 $sth = $dbh->prepare("update bowtie_$proj set orient = '+' where library like '%-R' and strand = '+'");
|
|
281 $sth->execute;
|
|
282
|
|
283
|
|
284
|
|
285 print "6.5 - determined mapping orientation\n";
|
|
286
|
|
287 #6.6
|
|
288 $sth = $dbh->prepare("select count(*) from temp_$proj");
|
|
289 $sth->execute;
|
|
290
|
|
291 while ((@row) = $sth->fetchrow_array) {
|
|
292 print "6.6 - ".$row[0]." Number of unique seqeunces that mapped\n";
|
|
293 }
|
|
294 #6.7
|
|
295 $sth = $dbh->prepare("select count(distinct chromo, start) from temp_$proj order by chromo, start+0; ");
|
|
296 $sth->execute;
|
|
297
|
|
298 while ((@row) = $sth->fetchrow_array) {
|
|
299 print "6.7 - ".$row[0]." Number of distinct genomic regions that were mapped to\n";
|
|
300 $nr_count = $row[0];
|
|
301 }
|
|
302
|
|
303 #7.1
|
|
304 $sth = $dbh->prepare("create table bowtie_lib_$proj select distinct library,chromo, start,stop, pos, sum(number) as count,strand, orient from bowtie_$proj group by library,chromo,start,stop,pos,strand order by chromo,start+0,orient");
|
|
305 $sth->execute;
|
|
306
|
|
307 print "7.1 - created bowtie_lib table\n";
|
|
308
|
|
309 $sth = $dbh->prepare("select chromo, start, stop, concat(library,'-',count),count,strand,start,stop from bowtie_lib_$proj");
|
|
310 #$sth = $dbh->prepare("select chromo, start, start+1, concat(library,count),count,'+',start,start+1 from bowtie_lib_$proj");
|
|
311 $sth->execute;
|
|
312 #7.2
|
|
313 open OUT, "> results/raw_$proj.BED";
|
|
314 print OUT "track type ='BED' name='raw$proj' description='all$proj' visibility=2 itemRgb='On'";
|
|
315 while ((@row) = $sth->fetchrow_array) {
|
|
316 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]";}
|
|
317 #print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t255,0,0";}
|
|
318 close OUT;
|
|
319 #7.3
|
|
320 $sth = $dbh->prepare("drop table if exists convert_2_hex_$proj");
|
|
321 $sth->execute;
|
|
322 $sth = $dbh->prepare("create table convert_2_hex_$proj (orient varchar(5), hex varchar(20))");
|
|
323 $sth->execute;
|
|
324 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"A\", \"255,0,0\")");
|
|
325 $sth->execute;
|
|
326 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"B\", \"0,0,255\")");
|
|
327 $sth->execute;
|
|
328 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"U\", \"0,0,0\")");
|
|
329 $sth->execute;
|
|
330
|
|
331 $sth = $dbh->prepare("select chromo, start, stop, concat(library,'-',count),count,strand,start,stop from bowtie_lib_$proj where count > 9");
|
|
332 #select chromo, start, start+1, concat(library,count),count,'+',start,start+1, hex from bowtie_lib_$proj A, convert_2_hex B where A.orient = B.orient and count > 9");
|
|
333 #$sth = $dbh->prepare("select chromo, start, start+1, concat(library,count),count,'+',start,start+1 from bowtie_lib_$proj where count > 9");
|
|
334 $sth->execute;
|
|
335 open OUT, "> results/raw10_$proj.BED";
|
|
336 print OUT "track type ='BED' name='raw10$proj' description='10$proj' visibility=2 itemRgb='On'";
|
|
337 while ((@row) = $sth->fetchrow_array) {
|
|
338 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]";}
|
|
339 #print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t255,0,0";}
|
|
340 close OUT;
|
|
341
|
|
342 #8.1
|
|
343
|
|
344 $sth = $dbh->prepare("drop table if exists illumina_hist_$proj");
|
|
345 $sth->execute;
|
|
346
|
|
347 $sth = $dbh->prepare("drop table if exists illumina_hist_raw_$proj");
|
|
348 $sth->execute;
|
|
349
|
|
350 $sth = $dbh->prepare("create table illumina_hist_raw_$proj select library, chromo,round(pos,-2) as start, count,orient from bowtie_lib_$proj;");
|
|
351 $sth->execute;
|
|
352
|
|
353 $sth = $dbh->prepare("update illumina_hist_raw_$proj set library = left(library,LENGTH(library)-2) where library like '%-L'");
|
|
354 $sth->execute;
|
|
355 $sth = $dbh->prepare("update illumina_hist_raw_$proj set library = left(library,LENGTH(library)-2) where library like '%-R'");
|
|
356 $sth->execute;
|
|
357
|
|
358 $sth = $dbh->prepare("create table illumina_hist_$proj select distinct library, chromo,start, sum(count) as count,orient from illumina_hist_raw_$proj group by library,chromo,start,orient;");
|
|
359 $sth->execute;
|
|
360
|
|
361 $sth = $dbh->prepare("drop table if exists illumina_hist_raw_$proj");
|
|
362 $sth->execute;
|
|
363 $sth = $dbh->prepare("drop table if exists lib_count_$proj");
|
|
364 $sth->execute;
|
|
365 #8.2
|
|
366 $sth = $dbh->prepare("create table lib_count_$proj select distinct library, sum(count) as total from illumina_hist_$proj group by library;");
|
|
367 $sth->execute;
|
|
368
|
|
369 #8.3 the following was added to improve process. Rather then using the total mapped as the denominator in the nr ratio, the total mappable should be used. This will minimize the effect of the crossmapping problem by increasing the requirements for inclusion into the library nr lists sfor librarys wiht low percentage mappiung such as woud exist form mapping to the wrong species
|
|
370
|
|
371 $sth = $dbh->prepare("drop table if exists lib_m_$proj");
|
|
372 $sth->execute;
|
|
373 $sth = $dbh->prepare("create table lib_m_$proj select distinct library, sum(number) as total from illumina4dec2_$proj group by library");
|
|
374 $sth->execute;
|
|
375 $sth = $dbh->prepare("update lib_m_$proj set library = left(library,LENGTH(library)-2) where library like '%-L'");
|
|
376 $sth->execute;
|
|
377 $sth = $dbh->prepare("update lib_m_$proj set library = left(library,LENGTH(library)-2) where library like '%-R'");
|
|
378 $sth->execute;
|
|
379 $sth = $dbh->prepare("drop table if exists lib_mappable_$proj");
|
|
380 $sth->execute;
|
|
381 $sth = $dbh->prepare("create table lib_mappable_$proj select library, sum(total) as total from lib_m_$proj group by library");
|
|
382 $sth->execute;
|
|
383
|
|
384
|
|
385
|
|
386 print "completed mapping!!!";
|
|
387
|
|
388 ###
|
|
389 #9 Generate a report describing the mapping.
|
|
390 ###
|
|
391
|
|
392 open OUT, "> results/summary_$proj.txt";
|
|
393 print OUT "Summary of project $proj\n\nA.Report of project based counts \n";
|
|
394
|
|
395 $sth = $dbh->prepare("select count(*) from illumina_raw_$proj");
|
|
396 $sth->execute;
|
|
397
|
|
398 while ((@row) = $sth->fetchrow_array) {
|
|
399 print OUT $row[0]." Total number of Seqeunces\n";
|
|
400 }
|
|
401
|
|
402 $sth = $dbh->prepare("select count(*) from illumina_decoded_$proj;");
|
|
403 $sth->execute;
|
|
404
|
|
405 while ((@row) = $sth->fetchrow_array) {
|
|
406 print OUT $row[0]." Total number of Sequences that map to a barcode\n";
|
|
407 }
|
|
408
|
|
409 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj");
|
|
410 $sth->execute;
|
|
411
|
|
412 while ((@row) = $sth->fetchrow_array) {
|
|
413 print OUT $row[0]." Total number of Sequences with a barcode that also have an IRDR\n";
|
|
414 }
|
|
415
|
|
416 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'idR';");
|
|
417 $sth->execute;
|
|
418
|
|
419 while ((@row) = $sth->fetchrow_array) {
|
|
420 print OUT $row[0]." Total number of Ideal Right linker sequences\n";
|
|
421 }
|
|
422
|
|
423 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'idL';");
|
|
424 $sth->execute;
|
|
425
|
|
426 while ((@row) = $sth->fetchrow_array) {
|
|
427 print OUT $row[0]." Total number of Ideal left linkersequences\n";
|
|
428 }
|
|
429
|
|
430
|
|
431
|
|
432 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'good';");
|
|
433 $sth->execute;
|
|
434
|
|
435 while ((@row) = $sth->fetchrow_array) {
|
|
436 print OUT $row[0]." Total number of good sequences\n";
|
|
437 }
|
|
438
|
|
439 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'bam';");
|
|
440 $sth->execute;
|
|
441
|
|
442 while ((@row) = $sth->fetchrow_array) {
|
|
443 print OUT $row[0]." Total number of Bamh1 sequence Artifact (removed)\n";
|
|
444 }
|
|
445
|
|
446 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'noTA';");
|
|
447 $sth->execute;
|
|
448
|
|
449 while ((@row) = $sth->fetchrow_array) {
|
|
450 print OUT $row[0]." Total number of sequences removed because of lack of TA sequence)\n";
|
|
451 }
|
|
452
|
|
453 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'min';");
|
|
454 $sth->execute;
|
|
455
|
|
456 while ((@row) = $sth->fetchrow_array) {
|
|
457 print OUT $row[0]." Total number of sequences removed because of sequence length < 24)\n";
|
|
458 }
|
|
459
|
|
460
|
|
461
|
|
462
|
|
463 $sth = $dbh->prepare("select count(*) from illumina4dec2_$proj;");
|
|
464 $sth->execute;
|
|
465
|
|
466 while ((@row) = $sth->fetchrow_array) {
|
|
467 print OUT $row[0]." Total number of unique seqeunces pryor to mapping\n";
|
|
468 }
|
|
469
|
|
470 $sth = $dbh->prepare("select sum(number) from bowtie_$proj;");
|
|
471 $sth->execute;
|
|
472 while ((@row) = $sth->fetchrow_array) {
|
|
473 print OUT $row[0]." Total SUM of seqeunces that mapped\n";
|
|
474 }
|
|
475
|
|
476 $sth = $dbh->prepare("select count(*) from bowtie_$proj;");
|
|
477 $sth->execute;
|
|
478
|
|
479 while ((@row) = $sth->fetchrow_array) {
|
|
480 print OUT $row[0]." Number of unique sequences that mapped\n";
|
|
481 }
|
|
482
|
|
483 $sth = $dbh->prepare("select count(*) from bowtie_lib_$proj;");
|
|
484 $sth->execute;
|
|
485
|
|
486 while ((@row) = $sth->fetchrow_array) {
|
|
487 print OUT $row[0]." Number of distinct locations mapped\n";
|
|
488 }
|
|
489
|
|
490 $sth = $dbh->prepare("select count(*) from illumina_hist_$proj;");
|
|
491 $sth->execute;
|
|
492
|
|
493 while ((@row) = $sth->fetchrow_array) {
|
|
494 print OUT $row[0]." Number of distinct regions mapped\n";
|
|
495 }
|
|
496 close OUT;
|
|
497
|
|
498 #report part B
|
|
499 open OUT, ">> results/summary_$proj.txt";
|
|
500 print OUT"\n\nB. Library counts associated with project $proj\n";
|
|
501
|
|
502 %sequence='';
|
|
503 %barcode_count='';
|
|
504 %IRDR_count='';
|
|
505 %IRDR_good='';
|
|
506 %IRDR_unique='';
|
|
507 %map_count='';
|
|
508 %map_total='';
|
|
509 %nr_count='';
|
|
510 @library='';
|
|
511
|
|
512 $sth = $dbh->prepare("select * from barcode_$proj");
|
|
513 $sth->execute;
|
|
514
|
|
515 while ((@row) = $sth->fetchrow_array) {
|
|
516 $sequence{$row[1]} = $row[0];
|
|
517 push(@library,$row[1]);
|
|
518 }
|
|
519
|
|
520 $sth = $dbh->prepare("select library, count(id) from illumina_decoded_$proj group by library");
|
|
521 $sth->execute;
|
|
522
|
|
523 while ((@row) = $sth->fetchrow_array) {
|
|
524 $barcode_count{$row[0]} = $row[1];
|
|
525 }
|
|
526
|
|
527 $sth = $dbh->prepare("select library, count(id) from illumina_without_IRDR_$proj group by library");
|
|
528 $sth->execute;
|
|
529
|
|
530 while ((@row) = $sth->fetchrow_array) {
|
|
531 $IRDR_count{$row[0]} = $row[1];
|
|
532 }
|
|
533
|
|
534
|
|
535
|
|
536 $sth = $dbh->prepare("select library, sum(number) from illumina4dec2_$proj group by library;");
|
|
537 $sth->execute;
|
|
538
|
|
539 while ((@row) = $sth->fetchrow_array) {
|
|
540 $IRDR_good{$row[0]} = $row[1];
|
|
541 }
|
|
542
|
|
543
|
|
544 $sth = $dbh->prepare("select library, count(number) from illumina4dec2_$proj group by library;");
|
|
545 $sth->execute;
|
|
546
|
|
547 while ((@row) = $sth->fetchrow_array) {
|
|
548 $IRDR_unique{$row[0]} = $row[1];
|
|
549 }
|
|
550
|
|
551 $sth = $dbh->prepare("select library, sum(number) from bowtie_$proj group by library");
|
|
552 $sth->execute;
|
|
553
|
|
554 while ((@row) = $sth->fetchrow_array) {
|
|
555 $map_count{$row[0]} = $row[1];
|
|
556 }
|
|
557
|
|
558 $sth = $dbh->prepare("select library, count(id) from bowtie_$proj group by library");
|
|
559 $sth->execute;
|
|
560
|
|
561 while ((@row) = $sth->fetchrow_array) {
|
|
562 $map_total{$row[0]} = $row[1];
|
|
563 }
|
|
564
|
|
565 $sth = $dbh->prepare("select library, count(chromo) from bowtie_lib_$proj group by library");
|
|
566 $sth->execute;
|
|
567
|
|
568 while ((@row) = $sth->fetchrow_array) {
|
|
569 $map_nr{$row[0]} = $row[1];
|
|
570 }
|
|
571
|
|
572 open (my $lib_stats, "> results/lib_stats_$proj.txt");
|
|
573 print OUT "Library\tBarcode Sequence\tBarcode\tIRDR\tMappable\tUnique mappable\tTotal map\tUnique map\tNR map\n";
|
|
574 print $lib_stats "Library,Barcode Sequence,with Barcode,with Mutagen,Total Mappable,Total Mapped,Mappable,Mapped,NR Inserts\n";
|
|
575 foreach $item (@library) {
|
|
576 print OUT "$item\t$sequence{$item}\t$barcode_count{$item}\t$IRDR_count{$item}\t$IRDR_good{$item}\t$IRDR_unique{$item}\t$map_count{$item}\t$map_total{$item}\t$map_nr{$item}\n";
|
|
577 if (length($item) == 0) { next; }
|
|
578 print $lib_stats "$item,$sequence{$item}";
|
|
579 &print_stat_val($lib_stats, \$barcode_count{$item});
|
|
580 &print_stat_val($lib_stats, \$IRDR_count{$item});
|
|
581 &print_stat_val($lib_stats, \$IRDR_good{$item});
|
|
582 &print_stat_val($lib_stats, \$IRDR_unique{$item});
|
|
583 &print_stat_val($lib_stats, \$map_count{$item});
|
|
584 &print_stat_val($lib_stats, \$map_total{$item});
|
|
585 &print_stat_val($lib_stats, \$map_nr{$item});
|
|
586 print $lib_stats "\n";
|
|
587 }
|
|
588 close $lib_stats;
|
|
589
|
|
590 print OUT "\n\nC. Regions associated with project $proj\n";
|
|
591
|
|
592 %mappable='';
|
|
593 %name='';
|
|
594 %count01='';
|
|
595 %count001='';
|
|
596 %count0001='';
|
|
597 %count0='';
|
|
598 @library='';
|
|
599
|
|
600 $sth = $dbh->prepare("select * from lib_count_$proj");
|
|
601 $sth->execute;
|
|
602
|
|
603 while ((@row) = $sth->fetchrow_array) {
|
|
604 $name{$row[0]} = $row[1];
|
|
605 push(@library,$row[0]);
|
|
606 }
|
|
607
|
|
608 $sth = $dbh->prepare("select * from lib_mappable_$proj");
|
|
609 $sth->execute;
|
|
610
|
|
611 while ((@row) = $sth->fetchrow_array) {
|
|
612 $mappable{$row[0]} = $row[1];
|
|
613 }
|
|
614
|
|
615 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .01 group by A.library order by chromo,start+0");
|
|
616 $sth->execute;
|
|
617
|
|
618 while ((@row) = $sth->fetchrow_array) {
|
|
619 $count01{$row[0]} = $row[1];
|
|
620 }
|
|
621
|
|
622 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .001 group by A.library order by chromo,start+0");
|
|
623 $sth->execute;
|
|
624
|
|
625 while ((@row) = $sth->fetchrow_array) {
|
|
626 $count001{$row[0]} = $row[1];
|
|
627 }
|
|
628
|
|
629 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .0001 group by A.library order by chromo,start+0");
|
|
630 $sth->execute;
|
|
631
|
|
632 while ((@row) = $sth->fetchrow_array) {
|
|
633 $count0001{$row[0]} = $row[1];
|
|
634 }
|
|
635
|
|
636 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > 0 group by A.library order by chromo,start+0");
|
|
637 $sth->execute;
|
|
638
|
|
639 open my $region_stats, "> results/region_stats_$proj.txt";
|
|
640 print OUT "Library\tTotal mappable\ttotal map\tLocations >0.01\tLocations >0.001\tLocations >0.0001\t All mapped Locations";
|
|
641 print $region_stats "Library,Total mappable,total map,Locations >0.01,Locations >0.001,Locations >0.0001,All mapped Locations\n";
|
|
642 while ((@row) = $sth->fetchrow_array) {
|
|
643 $count0{$row[0]} = $row[1];
|
|
644 }
|
|
645
|
|
646 foreach $item (@library) {
|
|
647 print OUT "$item\t$mappable{$item}\t$name{$item}\t$count01{$item}\t$count001{$item}\t$count0001{$item}\t$count0{$item}\n";
|
|
648 if (length($item) == 0) { next; }
|
|
649 print $region_stats "$item";
|
|
650 &print_stat_val($region_stats, \$mappable{$item});
|
|
651 &print_stat_val($region_stats, \$name{$item});
|
|
652 &print_stat_val($region_stats, \$count01{$item});
|
|
653 &print_stat_val($region_stats, \$count001{$item});
|
|
654 &print_stat_val($region_stats, \$count0001{$item});
|
|
655 &print_stat_val($region_stats, \$count0{$item});
|
|
656 print $region_stats "\n";
|
|
657 }
|
|
658 close OUT;
|
|
659 close $region_stats;
|
|
660
|
|
661 &set_project_status($proj, 1, 0);
|
|
662
|
|
663 exit(0);
|
|
664
|
|
665 sub add_default_quality {
|
|
666 print "Adding default quality values.";
|
|
667 open($input, "<", "data/seqs.tab") || die "Unable to open ${$in_fn_ref}: $!\n";
|
|
668 open($output, ">", "data/seqs_w_qual.tab") || die "Unable to open ${$out_fn_ref}: $!\n";
|
|
669 while(<$input>) {
|
|
670 chomp;
|
|
671 my @split = split("\t", $_);
|
|
672 if ($#split == 1) {
|
|
673 print $output join("\t", $split[0], "", $split[1]);
|
|
674 $split[1] =~ s/./I/g;
|
|
675 print $output sprintf("\t%s\n", $split[1]);
|
|
676 }
|
|
677 elsif ($#split == 2) {
|
|
678 print $output join("\t", $split[0], $split[1], $split[2]);
|
|
679 $split[2] =~ s/./I/g;
|
|
680 print $output sprintf("\t%s\n", $split[2]);
|
|
681 }
|
|
682 }
|
|
683 copy("data/seqs.tab", "data/original_seqs.tab") || die "Unable to move seqs.tab: $!\n";
|
|
684 copy("data/seqs_w_qual.tab", "data/seqs.tab") || die "Unable to move seqs.tab: $!\n";
|
|
685 }
|
|
686 sub alignment {
|
|
687 unless (-d "mapping") {
|
|
688 mkdir("mapping");
|
|
689 }
|
|
690 my @seq_lengths = (-1, 33, 30, 28, 24);
|
|
691 my @bowtie_mismatches = (3, 3, 2, 1, 0);
|
|
692
|
|
693 for (my $i = 0; $i <= $#seq_lengths; $i++) {
|
|
694 if($aligner eq "bwa") {
|
|
695 &prepare_fastq($seq_lengths[$i], "mapping/4map_$i.txt");
|
|
696 &bwa($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/aln$i.txt", "$envDir/mapping/samse$i.txt", "$envDir/mapping/mapping$i.txt", $i != $#bowtie_mismatches);
|
|
697 }
|
|
698 elsif($aligner eq "bow_bwa") {
|
|
699 &prepare_fasta($seq_lengths[$i], "mapping/4map_$i.txt");
|
|
700 &bowtie($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/mapping$i.txt", 1);
|
|
701 print "FINISHED Bowtie iteration #$i\n";
|
|
702 &prepare_fastq($seq_lengths[$i], "mapping/4map_$i_part2.txt");
|
|
703 &bwa($bowtie_mismatches[$i], "$envDir/mapping/4map_$i_part2.txt", "$envDir/mapping/aln$i.txt", "$envDir/mapping/samse$i.txt", "$envDir/mapping/mapping$i_part2.txt", $i != $#bowtie_mismatches);
|
|
704 }
|
|
705 else {
|
|
706 &prepare_fasta($seq_lengths[$i], "mapping/4map_$i.txt");
|
|
707 &bowtie($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/mapping$i.txt", $i != $#bowtie_mismatches);
|
|
708 print "FINISHED Bowtie iteration #$i\n";
|
|
709 }
|
|
710 }
|
|
711 }
|
|
712
|
|
713 sub bowtie {
|
|
714 my ($mismatches, $input_file, $output_file, $tidy) = @_;
|
|
715 print "bowtie($mismatches, $input_file, $output_file, $tidy)\n";
|
|
716 $sth = $dbh->prepare("create table if not exists illumina_blastout_$proj ( id varchar(50),strand varchar(20),chromo varchar(20),start varchar(20),mismatch varchar(30))");
|
|
717 $sth->execute;
|
|
718
|
|
719 my $cmd = "$bowtie_exe --quiet -a --best --strata -v $mismatches -m 1 --suppress 5,6,7 -f ";
|
|
720 #if ($quality) { $cmd = $cmd . "-q "; }
|
|
721 #else { $cmd = $cmd . "-f "; }
|
|
722 $cmd = $cmd . "$bowtie_idx $input_file $output_file |";
|
|
723 print "bowtie cmd: $cmd\n";
|
|
724 open($bowtie_out, $cmd);
|
|
725 close($bowtie_out);
|
|
726
|
|
727 $sth = $dbh->prepare("load DATA local INFILE '$output_file' INTO TABLE illumina_blastout_$proj ");
|
|
728 $sth->execute;
|
|
729
|
|
730 #Clean out mapped entries for the next round of fast(a|q) generation
|
|
731 if ($tidy) {
|
|
732 $sth = $dbh->prepare("drop table if exists temp_$proj ");
|
|
733 $sth->execute;
|
|
734
|
|
735 $sth = $dbh->prepare("create table temp_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,'-',-2),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B; ");
|
|
736 $sth->execute;
|
|
737
|
|
738 $sth = $dbh->prepare("delete illumina4dec_$proj from illumina4dec_$proj,temp_$proj where illumina4dec_$proj.id = temp_$proj.id ");
|
|
739 $sth->execute;
|
|
740 }
|
|
741 }
|
|
742
|
|
743 sub bwa {
|
|
744 my ($mismatches, $input_file, $output_aln_file, $output_samse_file, $output_file, $tidy) = @_;
|
|
745 $sth = $dbh->prepare("create table if not exists illumina_blastout_$proj ( id varchar(50),strand varchar(20),chromo varchar(20),start varchar(20),mismatch varchar(30))");
|
|
746 $sth->execute;
|
|
747
|
|
748 my $cmd = "$bwa_exe aln -n $mismatches -M 1 $bwa_idx $input_file > $output_aln_file | ";
|
|
749 open ($bwa_out, $cmd);
|
|
750 while(<$bwa_out>) { print $_; }
|
|
751 close($bwa_out);
|
|
752 $cmd = "$bwa_exe samse $bwa_idx $output_aln_file $input_file > $output_samse_file | ";
|
|
753 open ($bwa_out, $cmd);
|
|
754 while(<$bwa_out>) { print $_; }
|
|
755 close($bwa_out);
|
|
756
|
|
757 open ($sam, "<", $output_samse_file) || die "Unable to open samse output file, $output_samse_file: $!\n";
|
|
758 open ($final_out, ">", $output_file) || die "Unable to open final output file, $output_file: $!\n";
|
|
759 while(<$sam>) {
|
|
760 chomp;
|
|
761 if (/^@/) { next; }
|
|
762 my @sam_array = split("\t", $_);
|
|
763 if ($#sam_array < 11 || $sam_array[2] eq '*') { next; }
|
|
764 my $strand = '+';
|
|
765 if ($sam_array[1] | 0b11110111111) { $strand = '-'; }
|
|
766 my @opts = split(" ", $sam_array[11]);
|
|
767 foreach (@opts) {
|
|
768 if (/NM:i:([0-9]*)/) {
|
|
769 if ($1 > $mismatches) { next; }
|
|
770 }
|
|
771 elsif (/X0:i:([0-9]*)/) {
|
|
772 if ($1 > 1) { next; }
|
|
773 }
|
|
774 }
|
|
775 #print "CHROM: $sam_array[2]";
|
|
776 #$sam_array[2] =~ s/"SN:"//g;
|
|
777 #print "\t$sam_array[2]\n";
|
|
778 print $final_out join("\t", $sam_array[0], $strand, $chrom_map_ref->{$sam_array[2]}, $sam_array[3]-1, $sam_array[4]) . "\n";
|
|
779 }
|
|
780 close($final_out);
|
|
781 close($sam);
|
|
782
|
|
783 $sth = $dbh->prepare("load DATA local INFILE '$output_file' INTO TABLE illumina_blastout_$proj ");
|
|
784 $sth->execute;
|
|
785
|
|
786 #Clean out mapped entries for the next round of fast(a|q) generation
|
|
787 if ($tidy) {
|
|
788 $sth = $dbh->prepare("drop table if exists temp_$proj ");
|
|
789 $sth->execute;
|
|
790
|
|
791 $sth = $dbh->prepare("create table temp_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,'-',-2),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B; ");
|
|
792 $sth->execute;
|
|
793
|
|
794 $sth = $dbh->prepare("delete illumina4dec_$proj from illumina4dec_$proj,temp_$proj where illumina4dec_$proj.id = temp_$proj.id ");
|
|
795 $sth->execute;
|
|
796 }
|
|
797 }
|
|
798
|
|
799 sub prepare_fasta {
|
|
800 my ($length, $mapping_file) = @_;
|
|
801 print "prepare_fasta($length, $mapping_file)\n";
|
|
802 if ($length <= 0) {
|
|
803 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-',char_length(sequence)),sequence from illumina4dec_$proj where length(sequence) > 33");
|
|
804 }
|
|
805 else {
|
|
806 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-','$length'),left(sequence,$length) from illumina4dec_$proj where length(sequence) > $length-1");
|
|
807 }
|
|
808 $sth->execute;
|
|
809
|
|
810 open (OUT, ">", $mapping_file) || die "Unable to open aligner input file, $mapping_file: $!\n";
|
|
811 while ((@row) = $sth->fetchrow_array) {
|
|
812 print OUT ">$row[0]\n$row[1]\n";
|
|
813 }
|
|
814 close OUT;
|
|
815 }
|
|
816
|
|
817 sub prepare_fastq {
|
|
818 my ($length, $mapping_file) = @_;
|
|
819 if ($legnth <= 0) {
|
|
820 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-',char_length(sequence)),sequence,quality from illumina4dec_$proj where length(sequence) > 33");
|
|
821 }
|
|
822 else {
|
|
823 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-','$length'),left(sequence,$length),left(quality,$length) from illumina4dec_$proj where length(sequence) > $length-1");
|
|
824 }
|
|
825 $sth->execute;
|
|
826
|
|
827 open (OUT, ">", $mapping_file) || die "Unable to open aligner input file, $mapping_file: $!\n";
|
|
828 while ((@row) = $sth->fetchrow_array) {
|
|
829 print OUT sprintf("\@%s\n%s\n\+%s\n%s\n", $row[0], $row[1], $row[0], substr($row[2], 0, length($row[1])));
|
|
830 }
|
|
831 close OUT;
|
|
832 }
|
|
833
|
|
834 sub print_stat_val {
|
|
835 my ($fh_ptr, $val_ptr) = @_;
|
|
836 if (defined($val_ptr) && ${$val_ptr} > 0) {
|
|
837 print $fh_ptr ",${$val_ptr}";
|
|
838 } else {
|
|
839 print $fh_ptr ",0";
|
|
840 }
|
|
841 }
|