comparison lib/TAPDANCE.pl @ 0:1437a2df99c0

Uploaded
author jesse-erdmann
date Fri, 09 Dec 2011 11:56:56 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1437a2df99c0
1 # Description: Processing of transposon instertion sequences.
2 # Requirements perl DBI read write accessible database
3 # Requires a file labeled barcode.txt, and a tab delimited file contatining seqeunces.
4 # Should work on files of less than 40 million sequences.
5 # Identifies and removes barcodes, IRDR sequences and linker sequences from sequencing data, then compiles a list of non redundant sequences.
6 # This scripts conducts the PAPS steps 1
7 # Aaron Lyman Sarver July 12 2010
8 # version 3.1 December 1 2010
9
10 use File::Copy;
11 require "config.pl";
12
13 my $path = $0;
14 $path =~ s/\/\w*\.pl$//g;
15 print $path . "\n";
16 require "$path/util.pl";
17 require "$path/project_man.pl";
18
19 my $chrom_map_ref = &get_chrom_map();
20
21 print join("\t", keys %{$chrom_map_ref}) . "\n";
22
23 unless (-d "results") {
24 mkdir("results");
25 }
26
27 #Remove old data if it exists
28 $sth = $dbh->prepare("drop table if exists illumina_raw_$proj");
29 $sth->execute;
30 $sth = $dbh->prepare("drop table if exists barcode_$proj");
31 $sth->execute;
32 $sth = $dbh->prepare("drop table if exists illumina_decoded_$proj");
33 $sth->execute;
34 $sth = $dbh->prepare("drop table if exists illumina_without_IRDR_$proj");
35 $sth->execute;
36 $sth = $dbh->prepare("drop table if exists illumina_nr_pre_map_$proj");
37 $sth->execute;
38 $sth = $dbh->prepare("drop table if exists illumina4dec_$proj");
39 $sth->execute;
40 $sth = $dbh->prepare("drop table if exists illumina4dec2_$proj");
41 $sth->execute;
42 $sth = $dbh->prepare("drop table if exists illumina_blastout_$proj");
43 $sth->execute;
44
45 #1.1 Part 1 load database
46 print "Started PROCESSING\n Tables identified by the postfix $proj in the database";
47 #Determine number of columns in seqs.tab
48 open(SEQS, "<", "data/seqs.tab");
49 my @cols = split("\t", <SEQS>);
50 close(SEQS);
51 print join(":", @cols);
52 print "\n";
53 if ($#cols <= 2) {
54 &add_default_quality();
55 }
56 $sth = $dbh->prepare("create table illumina_raw_$proj ( id varchar(20), description varchar(20), sequence varchar(100), quality varchar(100) ) ");
57 $sth->execute;
58
59 print "generated illumina_raw table\n";
60
61 $sth = $dbh->prepare("load DATA local INFILE 'data/seqs.tab' INTO TABLE illumina_raw_$proj ");
62 $sth->execute;
63
64 print "1.1 complete - Loaded illumina_raw &r/m data/seqS.tab file \n";
65
66 #1.2
67 $sth = $dbh->prepare("create table barcode_$proj ( seq varchar(20), library varchar(30), direction varchar(20) )");
68 $sth->execute;
69
70 print "1.2 complete - Loaded barcode from data/barcode2lib.txt\n";
71
72 $sth = $dbh->prepare("load DATA local INFILE 'data/barcode2lib.txt' INTO TABLE barcode_$proj");
73 $sth->execute;
74
75 print "1.2 complete - Loaded barcode from data/barcode2lib.txt\n";
76
77 #1.3
78 $sth = $dbh->prepare("select count(*) from illumina_raw_$proj");
79 $sth->execute;
80
81 while ((@row) = $sth->fetchrow_array) {
82 print $row[0]." Number of lines in raw data file\n";
83 }
84
85 $sth = $dbh->prepare("select count(distinct id) from illumina_raw_$proj");
86 $sth->execute;
87
88 while ((@row) = $sth->fetchrow_array) {
89 print $row[0]." Number of unique ids in illumina raw\n";
90 }
91 #1.4
92 $sth = $dbh->prepare("select count(*) from barcode_$proj");
93 $sth->execute;
94
95 while ((@row) = $sth->fetchrow_array) {
96 print $row[0]." Number of lines in barcode \n";
97 }
98
99 #2.1 map the barcodes to the sequences..migrated to functionin config.pl.
100 my $barcode_length = 0;
101 open (BARCODE, "<", "data/barcode2lib.txt") || die "Unable to open barcode to library file. $!\n";
102 while (<BARCODE>) {
103 chomp;
104 my @split = split("\t", $_);
105 if (length($split[0]) > $barcode_length) { $barcode_length = length($split[0]); }
106 }
107 close(BARCODE);
108
109 $sth = $dbh->prepare("create table illumina_decoded_$proj select library, id, substring(sequence,$barcode_length+1) as decoded_sequence, substring(quality, $barcode_length+1) as decoded_quality from barcode_$proj,illumina_raw_$proj where sequence like concat(seq,'%')");
110 $sth->execute;
111 print "Mapped barcodes to sequences in table illumina_decoded2\n";
112 print "2.1 complete - Mapped barcodes to sequences in table illumina_decoded\n";
113
114 #2.2
115 $sth = $dbh->prepare("select distinct library, count(id) from illumina_decoded_$proj group by library;");
116 $sth->execute;
117
118 while ((@row) = $sth->fetchrow_array) {
119 print $row[0]."\t".$row[1]." inserts\n";
120 }
121
122 #2.3
123 if (!defined($mutagens)) {
124 #454
125 $mutagens="__GTATGTAAACTTCCGACTTCAACTG";
126 #illumina data
127 #$mutagens = "TGTATGTAAACTTCCGACTTCAACTG";
128 #illumina_2 data
129 #$mutagens = "___TGTATGTAAACTTCCGACTTCAACTG";
130 #MULV
131 #$mutagens = "CCAAACCTACAGGTGGGGTCTTTCA";
132 }
133
134 my @mutagens_array = split(",", $mutagens);
135 my $first = 1;
136
137 foreach my $mutagen (@mutagens_array) {
138 $mutagen =~ s/^\s+//;
139 $mutagen =~ s/\s+$//;
140 if ($first) {
141 $sql_str = "create table ";
142 $first = 0;
143 } else {
144 $sql_str = "insert into ";
145 }
146 $sql_str = join("", $sql_str, "illumina_without_IRDR_$proj select library,id,substring(decoded_sequence,", length($mutagen)+1, ") as insertion_sequence, substring(decoded_quality, ", length($mutagen)+1, ") as insertion_quality, 'good' as type from illumina_decoded_$proj where decoded_sequence like '", $mutagen, "%'");
147 $sth = $dbh->prepare($sql_str);
148 $sth->execute;
149 }
150 print "2.3 - Mapped and removed IRDR sequences, created table illumina_without_IRDR\n";
151
152 # 3.1 Find and Remove internal linker
153
154 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'idR' where insertion_sequence like '%GTCCCTTAAGCGGAGCC%'");
155 $sth->execute;
156 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set insertion_sequence = substring_index(insertion_sequence,'GTCCCTTAAGCGGAGCC',1) where type = 'idR';");
157 print "3.1 - Identified RIght linker in remaining seqeunces and renamed to idR.\n";
158
159 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'idL' where insertion_sequence like '%TAGTCCCTTAAGCGGAG%'");
160 $sth->execute;
161 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set insertion_sequence = substring_index(insertion_sequence,'TAGTCCCTTAAGCGGAG',1) where type = 'idL';");
162 $sth->execute;
163 print "3.2 - Identified Left linker in remaining seqeunces and renamed to idL.\n";
164
165 # Part 3.3 Find and Remove GGATCC sequences
166 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'bam' where insertion_sequence like '_____GGATCC%'");
167 $sth->execute;
168 print "3.3 - Identified seqeunce with GGATCC and renamed.\n";
169
170 # Part 3.4 Find and remove any remaining seqeunces that do not start with TA seqeunce
171 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'noTA' where insertion_sequence not like 'TA%'");
172 $sth->execute;
173 print "3.4 - Identified seqeunce without TA and renamed to noTA\n";
174
175 # 3.5 Find and identify any sequences that are two short for meaningful mapping
176 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'min' where (type = 'good' or type = 'idR' or type = 'idL') and char_length(insertion_sequence)<24");
177 $sth->execute;
178 print "3.5 Identified short seqeunces and label them min\n";
179
180 #4.1
181 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (type)");
182 $sth->execute;
183
184 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (insertion_sequence)");
185 $sth->execute;
186
187 #if ($quality) {
188 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (insertion_quality)");
189 $sth->execute;
190 print "indexed table illumina_without_IRDR\n";
191 #}
192
193 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (library)");
194 $sth->execute;
195 print "4.1 - indexed table illumina_without_IRDR\n";
196
197 ####
198 #4.2 Remove duplicate sequences maintaining count of all
199 ####
200 $sth = $dbh->prepare("create table illumina_nr_pre_map_$proj as select distinct count(type) as number, insertion_sequence, insertion_quality, library from illumina_without_IRDR_$proj where (type = 'good' or type = 'idR' or type = 'idL') group by library, insertion_sequence, insertion_quality");
201 $sth->execute;
202
203 print "4.2 - Created illumina_nr_pre_map table\n";
204
205 #4.3
206 $sth = $dbh->prepare("create table illumina4dec_$proj (id INT unsigned not null auto_increment, primary key(ID), number varchar(20), library varchar(30),sequence varchar(100), quality varchar(100) )");
207 $sth->execute;
208 print "4.3 - Created illumina4dec table\n";
209
210 #if ($quality) {
211 $sth = $dbh->prepare("insert into illumina4dec_$proj select NULL,number,library, insertion_sequence, insertion_quality from illumina_nr_pre_map_$proj");
212 #}
213 #else {
214 # $sth = $dbh->prepare("insert into illumina4dec_$proj select NULL,number,library, insertion_sequence from illumina_nr_pre_map_$proj");
215 #}
216 $sth->execute;
217
218 print "4.4 - populated nr_id table\n";
219
220
221 $sth = $dbh->prepare("create table illumina4dec2_$proj select * from illumina4dec_$proj");
222 $sth->execute;
223 print "4.5 - Copied illumina4dec table to illumina4dec2 for later use\n";
224
225 ####
226 # Bowtie variable definitions
227 ####
228 if (length($bowtie_exe) == 0) { $bowtie_exe = "/project/ccbioinf/Projects/labs/largaespada/Illumina_data/bowtie-0.12.5/bowtie"; }
229 if (length($bowtie_idx) == 0) { $bowtie_idx = "mm9"; }
230 if (length($envDir) == 0) { $envDir = "."; }
231
232 ####
233 #5 This section of the script runs the bowtie algorythm.
234 ####
235 &alignment();
236 print "5.1c - FINISHED Bowtie iteration #1\n";
237
238 $sth = $dbh->prepare("drop table if exists bowtie_$proj ");
239 $sth->execute;
240 $sth = $dbh->prepare("drop table if exists bowtie_lib_$proj ");
241 $sth->execute;
242
243 #####
244 # 6 reduce to NR set and load library and location mapping.
245 ######
246 $sth = $dbh->prepare("create table bowtie_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,':',-2),':',1) as library, substring_index(substring_index(B.id,':',-1),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B");
247 $sth->execute;
248
249 $sth = $dbh->prepare("alter table bowtie_$proj add pos varchar(20)");
250 $sth->execute;
251 $sth = $dbh->prepare("alter table bowtie_$proj add stop varchar(20)");
252 $sth->execute;
253
254 $sth = $dbh->prepare("update bowtie_$proj set pos = size + start where strand = '-'");
255 $sth->execute;
256
257 $sth = $dbh->prepare("update bowtie_$proj set pos = start + 1 where strand = '+'");
258 $sth->execute;
259
260 $sth = $dbh->prepare("update bowtie_$proj set stop = start + size");
261 $sth->execute;
262
263 print "6.1 6.2 6.3 6.4 - created bowtie table and adjusted \n";
264 ###
265 #6.5 determine transposon orientation (is it in the A or B orientation?)
266 ###
267
268 $sth = $dbh->prepare("alter table bowtie_$proj add orient varchar(5) default 'U'");
269 $sth->execute;
270
271 $sth = $dbh->prepare("update bowtie_$proj set orient = '-' where library like '%-L' and strand = '+'");
272 $sth->execute;
273
274 $sth = $dbh->prepare("update bowtie_$proj set orient = '+' where library like '%-L' and strand = '-'");
275 $sth->execute;
276
277 $sth = $dbh->prepare("update bowtie_$proj set orient = '-' where library like '%-R' and strand = '-'");
278 $sth->execute;
279
280 $sth = $dbh->prepare("update bowtie_$proj set orient = '+' where library like '%-R' and strand = '+'");
281 $sth->execute;
282
283
284
285 print "6.5 - determined mapping orientation\n";
286
287 #6.6
288 $sth = $dbh->prepare("select count(*) from temp_$proj");
289 $sth->execute;
290
291 while ((@row) = $sth->fetchrow_array) {
292 print "6.6 - ".$row[0]." Number of unique seqeunces that mapped\n";
293 }
294 #6.7
295 $sth = $dbh->prepare("select count(distinct chromo, start) from temp_$proj order by chromo, start+0; ");
296 $sth->execute;
297
298 while ((@row) = $sth->fetchrow_array) {
299 print "6.7 - ".$row[0]." Number of distinct genomic regions that were mapped to\n";
300 $nr_count = $row[0];
301 }
302
303 #7.1
304 $sth = $dbh->prepare("create table bowtie_lib_$proj select distinct library,chromo, start,stop, pos, sum(number) as count,strand, orient from bowtie_$proj group by library,chromo,start,stop,pos,strand order by chromo,start+0,orient");
305 $sth->execute;
306
307 print "7.1 - created bowtie_lib table\n";
308
309 $sth = $dbh->prepare("select chromo, start, stop, concat(library,'-',count),count,strand,start,stop from bowtie_lib_$proj");
310 #$sth = $dbh->prepare("select chromo, start, start+1, concat(library,count),count,'+',start,start+1 from bowtie_lib_$proj");
311 $sth->execute;
312 #7.2
313 open OUT, "> results/raw_$proj.BED";
314 print OUT "track type ='BED' name='raw$proj' description='all$proj' visibility=2 itemRgb='On'";
315 while ((@row) = $sth->fetchrow_array) {
316 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]";}
317 #print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t255,0,0";}
318 close OUT;
319 #7.3
320 $sth = $dbh->prepare("drop table if exists convert_2_hex_$proj");
321 $sth->execute;
322 $sth = $dbh->prepare("create table convert_2_hex_$proj (orient varchar(5), hex varchar(20))");
323 $sth->execute;
324 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"A\", \"255,0,0\")");
325 $sth->execute;
326 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"B\", \"0,0,255\")");
327 $sth->execute;
328 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"U\", \"0,0,0\")");
329 $sth->execute;
330
331 $sth = $dbh->prepare("select chromo, start, stop, concat(library,'-',count),count,strand,start,stop from bowtie_lib_$proj where count > 9");
332 #select chromo, start, start+1, concat(library,count),count,'+',start,start+1, hex from bowtie_lib_$proj A, convert_2_hex B where A.orient = B.orient and count > 9");
333 #$sth = $dbh->prepare("select chromo, start, start+1, concat(library,count),count,'+',start,start+1 from bowtie_lib_$proj where count > 9");
334 $sth->execute;
335 open OUT, "> results/raw10_$proj.BED";
336 print OUT "track type ='BED' name='raw10$proj' description='10$proj' visibility=2 itemRgb='On'";
337 while ((@row) = $sth->fetchrow_array) {
338 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]";}
339 #print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t255,0,0";}
340 close OUT;
341
342 #8.1
343
344 $sth = $dbh->prepare("drop table if exists illumina_hist_$proj");
345 $sth->execute;
346
347 $sth = $dbh->prepare("drop table if exists illumina_hist_raw_$proj");
348 $sth->execute;
349
350 $sth = $dbh->prepare("create table illumina_hist_raw_$proj select library, chromo,round(pos,-2) as start, count,orient from bowtie_lib_$proj;");
351 $sth->execute;
352
353 $sth = $dbh->prepare("update illumina_hist_raw_$proj set library = left(library,LENGTH(library)-2) where library like '%-L'");
354 $sth->execute;
355 $sth = $dbh->prepare("update illumina_hist_raw_$proj set library = left(library,LENGTH(library)-2) where library like '%-R'");
356 $sth->execute;
357
358 $sth = $dbh->prepare("create table illumina_hist_$proj select distinct library, chromo,start, sum(count) as count,orient from illumina_hist_raw_$proj group by library,chromo,start,orient;");
359 $sth->execute;
360
361 $sth = $dbh->prepare("drop table if exists illumina_hist_raw_$proj");
362 $sth->execute;
363 $sth = $dbh->prepare("drop table if exists lib_count_$proj");
364 $sth->execute;
365 #8.2
366 $sth = $dbh->prepare("create table lib_count_$proj select distinct library, sum(count) as total from illumina_hist_$proj group by library;");
367 $sth->execute;
368
369 #8.3 the following was added to improve process. Rather then using the total mapped as the denominator in the nr ratio, the total mappable should be used. This will minimize the effect of the crossmapping problem by increasing the requirements for inclusion into the library nr lists sfor librarys wiht low percentage mappiung such as woud exist form mapping to the wrong species
370
371 $sth = $dbh->prepare("drop table if exists lib_m_$proj");
372 $sth->execute;
373 $sth = $dbh->prepare("create table lib_m_$proj select distinct library, sum(number) as total from illumina4dec2_$proj group by library");
374 $sth->execute;
375 $sth = $dbh->prepare("update lib_m_$proj set library = left(library,LENGTH(library)-2) where library like '%-L'");
376 $sth->execute;
377 $sth = $dbh->prepare("update lib_m_$proj set library = left(library,LENGTH(library)-2) where library like '%-R'");
378 $sth->execute;
379 $sth = $dbh->prepare("drop table if exists lib_mappable_$proj");
380 $sth->execute;
381 $sth = $dbh->prepare("create table lib_mappable_$proj select library, sum(total) as total from lib_m_$proj group by library");
382 $sth->execute;
383
384
385
386 print "completed mapping!!!";
387
388 ###
389 #9 Generate a report describing the mapping.
390 ###
391
392 open OUT, "> results/summary_$proj.txt";
393 print OUT "Summary of project $proj\n\nA.Report of project based counts \n";
394
395 $sth = $dbh->prepare("select count(*) from illumina_raw_$proj");
396 $sth->execute;
397
398 while ((@row) = $sth->fetchrow_array) {
399 print OUT $row[0]." Total number of Seqeunces\n";
400 }
401
402 $sth = $dbh->prepare("select count(*) from illumina_decoded_$proj;");
403 $sth->execute;
404
405 while ((@row) = $sth->fetchrow_array) {
406 print OUT $row[0]." Total number of Sequences that map to a barcode\n";
407 }
408
409 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj");
410 $sth->execute;
411
412 while ((@row) = $sth->fetchrow_array) {
413 print OUT $row[0]." Total number of Sequences with a barcode that also have an IRDR\n";
414 }
415
416 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'idR';");
417 $sth->execute;
418
419 while ((@row) = $sth->fetchrow_array) {
420 print OUT $row[0]." Total number of Ideal Right linker sequences\n";
421 }
422
423 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'idL';");
424 $sth->execute;
425
426 while ((@row) = $sth->fetchrow_array) {
427 print OUT $row[0]." Total number of Ideal left linkersequences\n";
428 }
429
430
431
432 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'good';");
433 $sth->execute;
434
435 while ((@row) = $sth->fetchrow_array) {
436 print OUT $row[0]." Total number of good sequences\n";
437 }
438
439 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'bam';");
440 $sth->execute;
441
442 while ((@row) = $sth->fetchrow_array) {
443 print OUT $row[0]." Total number of Bamh1 sequence Artifact (removed)\n";
444 }
445
446 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'noTA';");
447 $sth->execute;
448
449 while ((@row) = $sth->fetchrow_array) {
450 print OUT $row[0]." Total number of sequences removed because of lack of TA sequence)\n";
451 }
452
453 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'min';");
454 $sth->execute;
455
456 while ((@row) = $sth->fetchrow_array) {
457 print OUT $row[0]." Total number of sequences removed because of sequence length < 24)\n";
458 }
459
460
461
462
463 $sth = $dbh->prepare("select count(*) from illumina4dec2_$proj;");
464 $sth->execute;
465
466 while ((@row) = $sth->fetchrow_array) {
467 print OUT $row[0]." Total number of unique seqeunces pryor to mapping\n";
468 }
469
470 $sth = $dbh->prepare("select sum(number) from bowtie_$proj;");
471 $sth->execute;
472 while ((@row) = $sth->fetchrow_array) {
473 print OUT $row[0]." Total SUM of seqeunces that mapped\n";
474 }
475
476 $sth = $dbh->prepare("select count(*) from bowtie_$proj;");
477 $sth->execute;
478
479 while ((@row) = $sth->fetchrow_array) {
480 print OUT $row[0]." Number of unique sequences that mapped\n";
481 }
482
483 $sth = $dbh->prepare("select count(*) from bowtie_lib_$proj;");
484 $sth->execute;
485
486 while ((@row) = $sth->fetchrow_array) {
487 print OUT $row[0]." Number of distinct locations mapped\n";
488 }
489
490 $sth = $dbh->prepare("select count(*) from illumina_hist_$proj;");
491 $sth->execute;
492
493 while ((@row) = $sth->fetchrow_array) {
494 print OUT $row[0]." Number of distinct regions mapped\n";
495 }
496 close OUT;
497
498 #report part B
499 open OUT, ">> results/summary_$proj.txt";
500 print OUT"\n\nB. Library counts associated with project $proj\n";
501
502 %sequence='';
503 %barcode_count='';
504 %IRDR_count='';
505 %IRDR_good='';
506 %IRDR_unique='';
507 %map_count='';
508 %map_total='';
509 %nr_count='';
510 @library='';
511
512 $sth = $dbh->prepare("select * from barcode_$proj");
513 $sth->execute;
514
515 while ((@row) = $sth->fetchrow_array) {
516 $sequence{$row[1]} = $row[0];
517 push(@library,$row[1]);
518 }
519
520 $sth = $dbh->prepare("select library, count(id) from illumina_decoded_$proj group by library");
521 $sth->execute;
522
523 while ((@row) = $sth->fetchrow_array) {
524 $barcode_count{$row[0]} = $row[1];
525 }
526
527 $sth = $dbh->prepare("select library, count(id) from illumina_without_IRDR_$proj group by library");
528 $sth->execute;
529
530 while ((@row) = $sth->fetchrow_array) {
531 $IRDR_count{$row[0]} = $row[1];
532 }
533
534
535
536 $sth = $dbh->prepare("select library, sum(number) from illumina4dec2_$proj group by library;");
537 $sth->execute;
538
539 while ((@row) = $sth->fetchrow_array) {
540 $IRDR_good{$row[0]} = $row[1];
541 }
542
543
544 $sth = $dbh->prepare("select library, count(number) from illumina4dec2_$proj group by library;");
545 $sth->execute;
546
547 while ((@row) = $sth->fetchrow_array) {
548 $IRDR_unique{$row[0]} = $row[1];
549 }
550
551 $sth = $dbh->prepare("select library, sum(number) from bowtie_$proj group by library");
552 $sth->execute;
553
554 while ((@row) = $sth->fetchrow_array) {
555 $map_count{$row[0]} = $row[1];
556 }
557
558 $sth = $dbh->prepare("select library, count(id) from bowtie_$proj group by library");
559 $sth->execute;
560
561 while ((@row) = $sth->fetchrow_array) {
562 $map_total{$row[0]} = $row[1];
563 }
564
565 $sth = $dbh->prepare("select library, count(chromo) from bowtie_lib_$proj group by library");
566 $sth->execute;
567
568 while ((@row) = $sth->fetchrow_array) {
569 $map_nr{$row[0]} = $row[1];
570 }
571
572 open (my $lib_stats, "> results/lib_stats_$proj.txt");
573 print OUT "Library\tBarcode Sequence\tBarcode\tIRDR\tMappable\tUnique mappable\tTotal map\tUnique map\tNR map\n";
574 print $lib_stats "Library,Barcode Sequence,with Barcode,with Mutagen,Total Mappable,Total Mapped,Mappable,Mapped,NR Inserts\n";
575 foreach $item (@library) {
576 print OUT "$item\t$sequence{$item}\t$barcode_count{$item}\t$IRDR_count{$item}\t$IRDR_good{$item}\t$IRDR_unique{$item}\t$map_count{$item}\t$map_total{$item}\t$map_nr{$item}\n";
577 if (length($item) == 0) { next; }
578 print $lib_stats "$item,$sequence{$item}";
579 &print_stat_val($lib_stats, \$barcode_count{$item});
580 &print_stat_val($lib_stats, \$IRDR_count{$item});
581 &print_stat_val($lib_stats, \$IRDR_good{$item});
582 &print_stat_val($lib_stats, \$IRDR_unique{$item});
583 &print_stat_val($lib_stats, \$map_count{$item});
584 &print_stat_val($lib_stats, \$map_total{$item});
585 &print_stat_val($lib_stats, \$map_nr{$item});
586 print $lib_stats "\n";
587 }
588 close $lib_stats;
589
590 print OUT "\n\nC. Regions associated with project $proj\n";
591
592 %mappable='';
593 %name='';
594 %count01='';
595 %count001='';
596 %count0001='';
597 %count0='';
598 @library='';
599
600 $sth = $dbh->prepare("select * from lib_count_$proj");
601 $sth->execute;
602
603 while ((@row) = $sth->fetchrow_array) {
604 $name{$row[0]} = $row[1];
605 push(@library,$row[0]);
606 }
607
608 $sth = $dbh->prepare("select * from lib_mappable_$proj");
609 $sth->execute;
610
611 while ((@row) = $sth->fetchrow_array) {
612 $mappable{$row[0]} = $row[1];
613 }
614
615 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .01 group by A.library order by chromo,start+0");
616 $sth->execute;
617
618 while ((@row) = $sth->fetchrow_array) {
619 $count01{$row[0]} = $row[1];
620 }
621
622 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .001 group by A.library order by chromo,start+0");
623 $sth->execute;
624
625 while ((@row) = $sth->fetchrow_array) {
626 $count001{$row[0]} = $row[1];
627 }
628
629 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .0001 group by A.library order by chromo,start+0");
630 $sth->execute;
631
632 while ((@row) = $sth->fetchrow_array) {
633 $count0001{$row[0]} = $row[1];
634 }
635
636 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > 0 group by A.library order by chromo,start+0");
637 $sth->execute;
638
639 open my $region_stats, "> results/region_stats_$proj.txt";
640 print OUT "Library\tTotal mappable\ttotal map\tLocations >0.01\tLocations >0.001\tLocations >0.0001\t All mapped Locations";
641 print $region_stats "Library,Total mappable,total map,Locations >0.01,Locations >0.001,Locations >0.0001,All mapped Locations\n";
642 while ((@row) = $sth->fetchrow_array) {
643 $count0{$row[0]} = $row[1];
644 }
645
646 foreach $item (@library) {
647 print OUT "$item\t$mappable{$item}\t$name{$item}\t$count01{$item}\t$count001{$item}\t$count0001{$item}\t$count0{$item}\n";
648 if (length($item) == 0) { next; }
649 print $region_stats "$item";
650 &print_stat_val($region_stats, \$mappable{$item});
651 &print_stat_val($region_stats, \$name{$item});
652 &print_stat_val($region_stats, \$count01{$item});
653 &print_stat_val($region_stats, \$count001{$item});
654 &print_stat_val($region_stats, \$count0001{$item});
655 &print_stat_val($region_stats, \$count0{$item});
656 print $region_stats "\n";
657 }
658 close OUT;
659 close $region_stats;
660
661 &set_project_status($proj, 1, 0);
662
663 exit(0);
664
665 sub add_default_quality {
666 print "Adding default quality values.";
667 open($input, "<", "data/seqs.tab") || die "Unable to open ${$in_fn_ref}: $!\n";
668 open($output, ">", "data/seqs_w_qual.tab") || die "Unable to open ${$out_fn_ref}: $!\n";
669 while(<$input>) {
670 chomp;
671 my @split = split("\t", $_);
672 if ($#split == 1) {
673 print $output join("\t", $split[0], "", $split[1]);
674 $split[1] =~ s/./I/g;
675 print $output sprintf("\t%s\n", $split[1]);
676 }
677 elsif ($#split == 2) {
678 print $output join("\t", $split[0], $split[1], $split[2]);
679 $split[2] =~ s/./I/g;
680 print $output sprintf("\t%s\n", $split[2]);
681 }
682 }
683 copy("data/seqs.tab", "data/original_seqs.tab") || die "Unable to move seqs.tab: $!\n";
684 copy("data/seqs_w_qual.tab", "data/seqs.tab") || die "Unable to move seqs.tab: $!\n";
685 }
686 sub alignment {
687 unless (-d "mapping") {
688 mkdir("mapping");
689 }
690 my @seq_lengths = (-1, 33, 30, 28, 24);
691 my @bowtie_mismatches = (3, 3, 2, 1, 0);
692
693 for (my $i = 0; $i <= $#seq_lengths; $i++) {
694 if($aligner eq "bwa") {
695 &prepare_fastq($seq_lengths[$i], "mapping/4map_$i.txt");
696 &bwa($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/aln$i.txt", "$envDir/mapping/samse$i.txt", "$envDir/mapping/mapping$i.txt", $i != $#bowtie_mismatches);
697 }
698 elsif($aligner eq "bow_bwa") {
699 &prepare_fasta($seq_lengths[$i], "mapping/4map_$i.txt");
700 &bowtie($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/mapping$i.txt", 1);
701 print "FINISHED Bowtie iteration #$i\n";
702 &prepare_fastq($seq_lengths[$i], "mapping/4map_$i_part2.txt");
703 &bwa($bowtie_mismatches[$i], "$envDir/mapping/4map_$i_part2.txt", "$envDir/mapping/aln$i.txt", "$envDir/mapping/samse$i.txt", "$envDir/mapping/mapping$i_part2.txt", $i != $#bowtie_mismatches);
704 }
705 else {
706 &prepare_fasta($seq_lengths[$i], "mapping/4map_$i.txt");
707 &bowtie($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/mapping$i.txt", $i != $#bowtie_mismatches);
708 print "FINISHED Bowtie iteration #$i\n";
709 }
710 }
711 }
712
713 sub bowtie {
714 my ($mismatches, $input_file, $output_file, $tidy) = @_;
715 print "bowtie($mismatches, $input_file, $output_file, $tidy)\n";
716 $sth = $dbh->prepare("create table if not exists illumina_blastout_$proj ( id varchar(50),strand varchar(20),chromo varchar(20),start varchar(20),mismatch varchar(30))");
717 $sth->execute;
718
719 my $cmd = "$bowtie_exe --quiet -a --best --strata -v $mismatches -m 1 --suppress 5,6,7 -f ";
720 #if ($quality) { $cmd = $cmd . "-q "; }
721 #else { $cmd = $cmd . "-f "; }
722 $cmd = $cmd . "$bowtie_idx $input_file $output_file |";
723 print "bowtie cmd: $cmd\n";
724 open($bowtie_out, $cmd);
725 close($bowtie_out);
726
727 $sth = $dbh->prepare("load DATA local INFILE '$output_file' INTO TABLE illumina_blastout_$proj ");
728 $sth->execute;
729
730 #Clean out mapped entries for the next round of fast(a|q) generation
731 if ($tidy) {
732 $sth = $dbh->prepare("drop table if exists temp_$proj ");
733 $sth->execute;
734
735 $sth = $dbh->prepare("create table temp_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,'-',-2),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B; ");
736 $sth->execute;
737
738 $sth = $dbh->prepare("delete illumina4dec_$proj from illumina4dec_$proj,temp_$proj where illumina4dec_$proj.id = temp_$proj.id ");
739 $sth->execute;
740 }
741 }
742
743 sub bwa {
744 my ($mismatches, $input_file, $output_aln_file, $output_samse_file, $output_file, $tidy) = @_;
745 $sth = $dbh->prepare("create table if not exists illumina_blastout_$proj ( id varchar(50),strand varchar(20),chromo varchar(20),start varchar(20),mismatch varchar(30))");
746 $sth->execute;
747
748 my $cmd = "$bwa_exe aln -n $mismatches -M 1 $bwa_idx $input_file > $output_aln_file | ";
749 open ($bwa_out, $cmd);
750 while(<$bwa_out>) { print $_; }
751 close($bwa_out);
752 $cmd = "$bwa_exe samse $bwa_idx $output_aln_file $input_file > $output_samse_file | ";
753 open ($bwa_out, $cmd);
754 while(<$bwa_out>) { print $_; }
755 close($bwa_out);
756
757 open ($sam, "<", $output_samse_file) || die "Unable to open samse output file, $output_samse_file: $!\n";
758 open ($final_out, ">", $output_file) || die "Unable to open final output file, $output_file: $!\n";
759 while(<$sam>) {
760 chomp;
761 if (/^@/) { next; }
762 my @sam_array = split("\t", $_);
763 if ($#sam_array < 11 || $sam_array[2] eq '*') { next; }
764 my $strand = '+';
765 if ($sam_array[1] | 0b11110111111) { $strand = '-'; }
766 my @opts = split(" ", $sam_array[11]);
767 foreach (@opts) {
768 if (/NM:i:([0-9]*)/) {
769 if ($1 > $mismatches) { next; }
770 }
771 elsif (/X0:i:([0-9]*)/) {
772 if ($1 > 1) { next; }
773 }
774 }
775 #print "CHROM: $sam_array[2]";
776 #$sam_array[2] =~ s/"SN:"//g;
777 #print "\t$sam_array[2]\n";
778 print $final_out join("\t", $sam_array[0], $strand, $chrom_map_ref->{$sam_array[2]}, $sam_array[3]-1, $sam_array[4]) . "\n";
779 }
780 close($final_out);
781 close($sam);
782
783 $sth = $dbh->prepare("load DATA local INFILE '$output_file' INTO TABLE illumina_blastout_$proj ");
784 $sth->execute;
785
786 #Clean out mapped entries for the next round of fast(a|q) generation
787 if ($tidy) {
788 $sth = $dbh->prepare("drop table if exists temp_$proj ");
789 $sth->execute;
790
791 $sth = $dbh->prepare("create table temp_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,'-',-2),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B; ");
792 $sth->execute;
793
794 $sth = $dbh->prepare("delete illumina4dec_$proj from illumina4dec_$proj,temp_$proj where illumina4dec_$proj.id = temp_$proj.id ");
795 $sth->execute;
796 }
797 }
798
799 sub prepare_fasta {
800 my ($length, $mapping_file) = @_;
801 print "prepare_fasta($length, $mapping_file)\n";
802 if ($length <= 0) {
803 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-',char_length(sequence)),sequence from illumina4dec_$proj where length(sequence) > 33");
804 }
805 else {
806 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-','$length'),left(sequence,$length) from illumina4dec_$proj where length(sequence) > $length-1");
807 }
808 $sth->execute;
809
810 open (OUT, ">", $mapping_file) || die "Unable to open aligner input file, $mapping_file: $!\n";
811 while ((@row) = $sth->fetchrow_array) {
812 print OUT ">$row[0]\n$row[1]\n";
813 }
814 close OUT;
815 }
816
817 sub prepare_fastq {
818 my ($length, $mapping_file) = @_;
819 if ($legnth <= 0) {
820 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-',char_length(sequence)),sequence,quality from illumina4dec_$proj where length(sequence) > 33");
821 }
822 else {
823 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-','$length'),left(sequence,$length),left(quality,$length) from illumina4dec_$proj where length(sequence) > $length-1");
824 }
825 $sth->execute;
826
827 open (OUT, ">", $mapping_file) || die "Unable to open aligner input file, $mapping_file: $!\n";
828 while ((@row) = $sth->fetchrow_array) {
829 print OUT sprintf("\@%s\n%s\n\+%s\n%s\n", $row[0], $row[1], $row[0], substr($row[2], 0, length($row[1])));
830 }
831 close OUT;
832 }
833
834 sub print_stat_val {
835 my ($fh_ptr, $val_ptr) = @_;
836 if (defined($val_ptr) && ${$val_ptr} > 0) {
837 print $fh_ptr ",${$val_ptr}";
838 } else {
839 print $fh_ptr ",0";
840 }
841 }