annotate lib/TAPDANCE.pl @ 3:17ce4f3bffa2 default tip

Uploaded
author jesse-erdmann
date Tue, 24 Jan 2012 18:33:41 -0500
parents 1437a2df99c0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
1 # Description: Processing of transposon instertion sequences.
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
2 # Requirements perl DBI read write accessible database
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
3 # Requires a file labeled barcode.txt, and a tab delimited file contatining seqeunces.
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
4 # Should work on files of less than 40 million sequences.
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
5 # Identifies and removes barcodes, IRDR sequences and linker sequences from sequencing data, then compiles a list of non redundant sequences.
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
6 # This scripts conducts the PAPS steps 1
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
7 # Aaron Lyman Sarver July 12 2010
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
8 # version 3.1 December 1 2010
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
9
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
10 use File::Copy;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
11 require "config.pl";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
12
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
13 my $path = $0;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
14 $path =~ s/\/\w*\.pl$//g;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
15 print $path . "\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
16 require "$path/util.pl";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
17 require "$path/project_man.pl";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
18
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
19 my $chrom_map_ref = &get_chrom_map();
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
20
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
21 print join("\t", keys %{$chrom_map_ref}) . "\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
22
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
23 unless (-d "results") {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
24 mkdir("results");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
25 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
26
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
27 #Remove old data if it exists
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
28 $sth = $dbh->prepare("drop table if exists illumina_raw_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
29 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
30 $sth = $dbh->prepare("drop table if exists barcode_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
31 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
32 $sth = $dbh->prepare("drop table if exists illumina_decoded_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
33 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
34 $sth = $dbh->prepare("drop table if exists illumina_without_IRDR_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
35 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
36 $sth = $dbh->prepare("drop table if exists illumina_nr_pre_map_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
37 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
38 $sth = $dbh->prepare("drop table if exists illumina4dec_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
39 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
40 $sth = $dbh->prepare("drop table if exists illumina4dec2_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
41 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
42 $sth = $dbh->prepare("drop table if exists illumina_blastout_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
43 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
44
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
45 #1.1 Part 1 load database
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
46 print "Started PROCESSING\n Tables identified by the postfix $proj in the database";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
47 #Determine number of columns in seqs.tab
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
48 open(SEQS, "<", "data/seqs.tab");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
49 my @cols = split("\t", <SEQS>);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
50 close(SEQS);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
51 print join(":", @cols);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
52 print "\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
53 if ($#cols <= 2) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
54 &add_default_quality();
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
55 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
56 $sth = $dbh->prepare("create table illumina_raw_$proj ( id varchar(20), description varchar(20), sequence varchar(100), quality varchar(100) ) ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
57 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
58
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
59 print "generated illumina_raw table\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
60
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
61 $sth = $dbh->prepare("load DATA local INFILE 'data/seqs.tab' INTO TABLE illumina_raw_$proj ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
62 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
63
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
64 print "1.1 complete - Loaded illumina_raw &r/m data/seqS.tab file \n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
65
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
66 #1.2
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
67 $sth = $dbh->prepare("create table barcode_$proj ( seq varchar(20), library varchar(30), direction varchar(20) )");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
68 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
69
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
70 print "1.2 complete - Loaded barcode from data/barcode2lib.txt\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
71
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
72 $sth = $dbh->prepare("load DATA local INFILE 'data/barcode2lib.txt' INTO TABLE barcode_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
73 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
74
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
75 print "1.2 complete - Loaded barcode from data/barcode2lib.txt\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
76
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
77 #1.3
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
78 $sth = $dbh->prepare("select count(*) from illumina_raw_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
79 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
80
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
81 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
82 print $row[0]." Number of lines in raw data file\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
83 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
84
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
85 $sth = $dbh->prepare("select count(distinct id) from illumina_raw_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
86 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
87
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
88 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
89 print $row[0]." Number of unique ids in illumina raw\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
90 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
91 #1.4
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
92 $sth = $dbh->prepare("select count(*) from barcode_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
93 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
94
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
95 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
96 print $row[0]." Number of lines in barcode \n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
97 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
98
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
99 #2.1 map the barcodes to the sequences..migrated to functionin config.pl.
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
100 my $barcode_length = 0;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
101 open (BARCODE, "<", "data/barcode2lib.txt") || die "Unable to open barcode to library file. $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
102 while (<BARCODE>) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
103 chomp;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
104 my @split = split("\t", $_);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
105 if (length($split[0]) > $barcode_length) { $barcode_length = length($split[0]); }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
106 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
107 close(BARCODE);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
108
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
109 $sth = $dbh->prepare("create table illumina_decoded_$proj select library, id, substring(sequence,$barcode_length+1) as decoded_sequence, substring(quality, $barcode_length+1) as decoded_quality from barcode_$proj,illumina_raw_$proj where sequence like concat(seq,'%')");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
110 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
111 print "Mapped barcodes to sequences in table illumina_decoded2\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
112 print "2.1 complete - Mapped barcodes to sequences in table illumina_decoded\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
113
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
114 #2.2
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
115 $sth = $dbh->prepare("select distinct library, count(id) from illumina_decoded_$proj group by library;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
116 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
117
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
118 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
119 print $row[0]."\t".$row[1]." inserts\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
120 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
121
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
122 #2.3
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
123 if (!defined($mutagens)) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
124 #454
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
125 $mutagens="__GTATGTAAACTTCCGACTTCAACTG";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
126 #illumina data
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
127 #$mutagens = "TGTATGTAAACTTCCGACTTCAACTG";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
128 #illumina_2 data
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
129 #$mutagens = "___TGTATGTAAACTTCCGACTTCAACTG";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
130 #MULV
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
131 #$mutagens = "CCAAACCTACAGGTGGGGTCTTTCA";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
132 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
133
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
134 my @mutagens_array = split(",", $mutagens);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
135 my $first = 1;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
136
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
137 foreach my $mutagen (@mutagens_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
138 $mutagen =~ s/^\s+//;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
139 $mutagen =~ s/\s+$//;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
140 if ($first) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
141 $sql_str = "create table ";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
142 $first = 0;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
143 } else {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
144 $sql_str = "insert into ";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
145 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
146 $sql_str = join("", $sql_str, "illumina_without_IRDR_$proj select library,id,substring(decoded_sequence,", length($mutagen)+1, ") as insertion_sequence, substring(decoded_quality, ", length($mutagen)+1, ") as insertion_quality, 'good' as type from illumina_decoded_$proj where decoded_sequence like '", $mutagen, "%'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
147 $sth = $dbh->prepare($sql_str);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
148 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
149 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
150 print "2.3 - Mapped and removed IRDR sequences, created table illumina_without_IRDR\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
151
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
152 # 3.1 Find and Remove internal linker
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
153
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
154 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'idR' where insertion_sequence like '%GTCCCTTAAGCGGAGCC%'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
155 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
156 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set insertion_sequence = substring_index(insertion_sequence,'GTCCCTTAAGCGGAGCC',1) where type = 'idR';");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
157 print "3.1 - Identified RIght linker in remaining seqeunces and renamed to idR.\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
158
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
159 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'idL' where insertion_sequence like '%TAGTCCCTTAAGCGGAG%'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
160 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
161 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set insertion_sequence = substring_index(insertion_sequence,'TAGTCCCTTAAGCGGAG',1) where type = 'idL';");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
162 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
163 print "3.2 - Identified Left linker in remaining seqeunces and renamed to idL.\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
164
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
165 # Part 3.3 Find and Remove GGATCC sequences
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
166 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'bam' where insertion_sequence like '_____GGATCC%'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
167 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
168 print "3.3 - Identified seqeunce with GGATCC and renamed.\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
169
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
170 # Part 3.4 Find and remove any remaining seqeunces that do not start with TA seqeunce
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
171 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'noTA' where insertion_sequence not like 'TA%'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
172 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
173 print "3.4 - Identified seqeunce without TA and renamed to noTA\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
174
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
175 # 3.5 Find and identify any sequences that are two short for meaningful mapping
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
176 $sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'min' where (type = 'good' or type = 'idR' or type = 'idL') and char_length(insertion_sequence)<24");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
177 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
178 print "3.5 Identified short seqeunces and label them min\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
179
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
180 #4.1
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
181 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (type)");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
182 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
183
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
184 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (insertion_sequence)");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
185 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
186
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
187 #if ($quality) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
188 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (insertion_quality)");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
189 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
190 print "indexed table illumina_without_IRDR\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
191 #}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
192
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
193 $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (library)");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
194 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
195 print "4.1 - indexed table illumina_without_IRDR\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
196
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
197 ####
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
198 #4.2 Remove duplicate sequences maintaining count of all
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
199 ####
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
200 $sth = $dbh->prepare("create table illumina_nr_pre_map_$proj as select distinct count(type) as number, insertion_sequence, insertion_quality, library from illumina_without_IRDR_$proj where (type = 'good' or type = 'idR' or type = 'idL') group by library, insertion_sequence, insertion_quality");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
201 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
202
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
203 print "4.2 - Created illumina_nr_pre_map table\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
204
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
205 #4.3
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
206 $sth = $dbh->prepare("create table illumina4dec_$proj (id INT unsigned not null auto_increment, primary key(ID), number varchar(20), library varchar(30),sequence varchar(100), quality varchar(100) )");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
207 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
208 print "4.3 - Created illumina4dec table\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
209
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
210 #if ($quality) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
211 $sth = $dbh->prepare("insert into illumina4dec_$proj select NULL,number,library, insertion_sequence, insertion_quality from illumina_nr_pre_map_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
212 #}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
213 #else {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
214 # $sth = $dbh->prepare("insert into illumina4dec_$proj select NULL,number,library, insertion_sequence from illumina_nr_pre_map_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
215 #}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
216 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
217
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
218 print "4.4 - populated nr_id table\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
219
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
220
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
221 $sth = $dbh->prepare("create table illumina4dec2_$proj select * from illumina4dec_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
222 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
223 print "4.5 - Copied illumina4dec table to illumina4dec2 for later use\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
224
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
225 ####
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
226 # Bowtie variable definitions
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
227 ####
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
228 if (length($bowtie_exe) == 0) { $bowtie_exe = "/project/ccbioinf/Projects/labs/largaespada/Illumina_data/bowtie-0.12.5/bowtie"; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
229 if (length($bowtie_idx) == 0) { $bowtie_idx = "mm9"; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
230 if (length($envDir) == 0) { $envDir = "."; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
231
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
232 ####
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
233 #5 This section of the script runs the bowtie algorythm.
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
234 ####
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
235 &alignment();
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
236 print "5.1c - FINISHED Bowtie iteration #1\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
237
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
238 $sth = $dbh->prepare("drop table if exists bowtie_$proj ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
239 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
240 $sth = $dbh->prepare("drop table if exists bowtie_lib_$proj ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
241 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
242
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
243 #####
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
244 # 6 reduce to NR set and load library and location mapping.
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
245 ######
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
246 $sth = $dbh->prepare("create table bowtie_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,':',-2),':',1) as library, substring_index(substring_index(B.id,':',-1),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
247 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
248
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
249 $sth = $dbh->prepare("alter table bowtie_$proj add pos varchar(20)");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
250 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
251 $sth = $dbh->prepare("alter table bowtie_$proj add stop varchar(20)");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
252 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
253
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
254 $sth = $dbh->prepare("update bowtie_$proj set pos = size + start where strand = '-'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
255 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
256
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
257 $sth = $dbh->prepare("update bowtie_$proj set pos = start + 1 where strand = '+'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
258 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
259
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
260 $sth = $dbh->prepare("update bowtie_$proj set stop = start + size");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
261 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
262
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
263 print "6.1 6.2 6.3 6.4 - created bowtie table and adjusted \n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
264 ###
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
265 #6.5 determine transposon orientation (is it in the A or B orientation?)
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
266 ###
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
267
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
268 $sth = $dbh->prepare("alter table bowtie_$proj add orient varchar(5) default 'U'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
269 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
270
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
271 $sth = $dbh->prepare("update bowtie_$proj set orient = '-' where library like '%-L' and strand = '+'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
272 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
273
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
274 $sth = $dbh->prepare("update bowtie_$proj set orient = '+' where library like '%-L' and strand = '-'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
275 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
276
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
277 $sth = $dbh->prepare("update bowtie_$proj set orient = '-' where library like '%-R' and strand = '-'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
278 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
279
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
280 $sth = $dbh->prepare("update bowtie_$proj set orient = '+' where library like '%-R' and strand = '+'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
281 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
282
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
283
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
284
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
285 print "6.5 - determined mapping orientation\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
286
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
287 #6.6
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
288 $sth = $dbh->prepare("select count(*) from temp_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
289 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
290
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
291 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
292 print "6.6 - ".$row[0]." Number of unique seqeunces that mapped\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
293 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
294 #6.7
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
295 $sth = $dbh->prepare("select count(distinct chromo, start) from temp_$proj order by chromo, start+0; ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
296 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
297
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
298 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
299 print "6.7 - ".$row[0]." Number of distinct genomic regions that were mapped to\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
300 $nr_count = $row[0];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
301 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
302
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
303 #7.1
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
304 $sth = $dbh->prepare("create table bowtie_lib_$proj select distinct library,chromo, start,stop, pos, sum(number) as count,strand, orient from bowtie_$proj group by library,chromo,start,stop,pos,strand order by chromo,start+0,orient");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
305 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
306
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
307 print "7.1 - created bowtie_lib table\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
308
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
309 $sth = $dbh->prepare("select chromo, start, stop, concat(library,'-',count),count,strand,start,stop from bowtie_lib_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
310 #$sth = $dbh->prepare("select chromo, start, start+1, concat(library,count),count,'+',start,start+1 from bowtie_lib_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
311 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
312 #7.2
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
313 open OUT, "> results/raw_$proj.BED";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
314 print OUT "track type ='BED' name='raw$proj' description='all$proj' visibility=2 itemRgb='On'";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
315 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
316 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]";}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
317 #print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t255,0,0";}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
318 close OUT;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
319 #7.3
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
320 $sth = $dbh->prepare("drop table if exists convert_2_hex_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
321 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
322 $sth = $dbh->prepare("create table convert_2_hex_$proj (orient varchar(5), hex varchar(20))");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
323 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
324 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"A\", \"255,0,0\")");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
325 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
326 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"B\", \"0,0,255\")");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
327 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
328 $sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"U\", \"0,0,0\")");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
329 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
330
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
331 $sth = $dbh->prepare("select chromo, start, stop, concat(library,'-',count),count,strand,start,stop from bowtie_lib_$proj where count > 9");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
332 #select chromo, start, start+1, concat(library,count),count,'+',start,start+1, hex from bowtie_lib_$proj A, convert_2_hex B where A.orient = B.orient and count > 9");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
333 #$sth = $dbh->prepare("select chromo, start, start+1, concat(library,count),count,'+',start,start+1 from bowtie_lib_$proj where count > 9");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
334 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
335 open OUT, "> results/raw10_$proj.BED";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
336 print OUT "track type ='BED' name='raw10$proj' description='10$proj' visibility=2 itemRgb='On'";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
337 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
338 print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]";}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
339 #print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t255,0,0";}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
340 close OUT;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
341
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
342 #8.1
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
343
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
344 $sth = $dbh->prepare("drop table if exists illumina_hist_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
345 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
346
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
347 $sth = $dbh->prepare("drop table if exists illumina_hist_raw_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
348 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
349
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
350 $sth = $dbh->prepare("create table illumina_hist_raw_$proj select library, chromo,round(pos,-2) as start, count,orient from bowtie_lib_$proj;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
351 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
352
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
353 $sth = $dbh->prepare("update illumina_hist_raw_$proj set library = left(library,LENGTH(library)-2) where library like '%-L'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
354 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
355 $sth = $dbh->prepare("update illumina_hist_raw_$proj set library = left(library,LENGTH(library)-2) where library like '%-R'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
356 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
357
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
358 $sth = $dbh->prepare("create table illumina_hist_$proj select distinct library, chromo,start, sum(count) as count,orient from illumina_hist_raw_$proj group by library,chromo,start,orient;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
359 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
360
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
361 $sth = $dbh->prepare("drop table if exists illumina_hist_raw_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
362 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
363 $sth = $dbh->prepare("drop table if exists lib_count_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
364 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
365 #8.2
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
366 $sth = $dbh->prepare("create table lib_count_$proj select distinct library, sum(count) as total from illumina_hist_$proj group by library;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
367 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
368
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
369 #8.3 the following was added to improve process. Rather then using the total mapped as the denominator in the nr ratio, the total mappable should be used. This will minimize the effect of the crossmapping problem by increasing the requirements for inclusion into the library nr lists sfor librarys wiht low percentage mappiung such as woud exist form mapping to the wrong species
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
370
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
371 $sth = $dbh->prepare("drop table if exists lib_m_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
372 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
373 $sth = $dbh->prepare("create table lib_m_$proj select distinct library, sum(number) as total from illumina4dec2_$proj group by library");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
374 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
375 $sth = $dbh->prepare("update lib_m_$proj set library = left(library,LENGTH(library)-2) where library like '%-L'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
376 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
377 $sth = $dbh->prepare("update lib_m_$proj set library = left(library,LENGTH(library)-2) where library like '%-R'");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
378 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
379 $sth = $dbh->prepare("drop table if exists lib_mappable_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
380 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
381 $sth = $dbh->prepare("create table lib_mappable_$proj select library, sum(total) as total from lib_m_$proj group by library");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
382 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
383
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
384
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
385
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
386 print "completed mapping!!!";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
387
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
388 ###
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
389 #9 Generate a report describing the mapping.
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
390 ###
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
391
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
392 open OUT, "> results/summary_$proj.txt";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
393 print OUT "Summary of project $proj\n\nA.Report of project based counts \n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
394
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
395 $sth = $dbh->prepare("select count(*) from illumina_raw_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
396 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
397
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
398 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
399 print OUT $row[0]." Total number of Seqeunces\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
400 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
401
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
402 $sth = $dbh->prepare("select count(*) from illumina_decoded_$proj;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
403 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
404
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
405 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
406 print OUT $row[0]." Total number of Sequences that map to a barcode\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
407 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
408
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
409 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
410 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
411
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
412 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
413 print OUT $row[0]." Total number of Sequences with a barcode that also have an IRDR\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
414 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
415
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
416 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'idR';");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
417 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
418
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
419 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
420 print OUT $row[0]." Total number of Ideal Right linker sequences\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
421 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
422
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
423 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'idL';");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
424 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
425
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
426 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
427 print OUT $row[0]." Total number of Ideal left linkersequences\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
428 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
429
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
430
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
431
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
432 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'good';");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
433 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
434
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
435 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
436 print OUT $row[0]." Total number of good sequences\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
437 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
438
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
439 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'bam';");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
440 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
441
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
442 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
443 print OUT $row[0]." Total number of Bamh1 sequence Artifact (removed)\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
444 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
445
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
446 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'noTA';");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
447 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
448
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
449 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
450 print OUT $row[0]." Total number of sequences removed because of lack of TA sequence)\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
451 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
452
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
453 $sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'min';");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
454 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
455
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
456 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
457 print OUT $row[0]." Total number of sequences removed because of sequence length < 24)\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
458 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
459
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
460
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
461
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
462
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
463 $sth = $dbh->prepare("select count(*) from illumina4dec2_$proj;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
464 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
465
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
466 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
467 print OUT $row[0]." Total number of unique seqeunces pryor to mapping\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
468 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
469
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
470 $sth = $dbh->prepare("select sum(number) from bowtie_$proj;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
471 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
472 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
473 print OUT $row[0]." Total SUM of seqeunces that mapped\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
474 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
475
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
476 $sth = $dbh->prepare("select count(*) from bowtie_$proj;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
477 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
478
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
479 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
480 print OUT $row[0]." Number of unique sequences that mapped\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
481 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
482
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
483 $sth = $dbh->prepare("select count(*) from bowtie_lib_$proj;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
484 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
485
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
486 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
487 print OUT $row[0]." Number of distinct locations mapped\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
488 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
489
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
490 $sth = $dbh->prepare("select count(*) from illumina_hist_$proj;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
491 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
492
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
493 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
494 print OUT $row[0]." Number of distinct regions mapped\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
495 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
496 close OUT;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
497
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
498 #report part B
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
499 open OUT, ">> results/summary_$proj.txt";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
500 print OUT"\n\nB. Library counts associated with project $proj\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
501
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
502 %sequence='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
503 %barcode_count='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
504 %IRDR_count='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
505 %IRDR_good='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
506 %IRDR_unique='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
507 %map_count='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
508 %map_total='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
509 %nr_count='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
510 @library='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
511
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
512 $sth = $dbh->prepare("select * from barcode_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
513 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
514
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
515 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
516 $sequence{$row[1]} = $row[0];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
517 push(@library,$row[1]);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
518 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
519
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
520 $sth = $dbh->prepare("select library, count(id) from illumina_decoded_$proj group by library");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
521 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
522
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
523 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
524 $barcode_count{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
525 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
526
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
527 $sth = $dbh->prepare("select library, count(id) from illumina_without_IRDR_$proj group by library");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
528 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
529
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
530 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
531 $IRDR_count{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
532 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
533
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
534
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
535
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
536 $sth = $dbh->prepare("select library, sum(number) from illumina4dec2_$proj group by library;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
537 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
538
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
539 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
540 $IRDR_good{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
541 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
542
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
543
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
544 $sth = $dbh->prepare("select library, count(number) from illumina4dec2_$proj group by library;");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
545 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
546
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
547 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
548 $IRDR_unique{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
549 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
550
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
551 $sth = $dbh->prepare("select library, sum(number) from bowtie_$proj group by library");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
552 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
553
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
554 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
555 $map_count{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
556 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
557
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
558 $sth = $dbh->prepare("select library, count(id) from bowtie_$proj group by library");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
559 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
560
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
561 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
562 $map_total{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
563 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
564
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
565 $sth = $dbh->prepare("select library, count(chromo) from bowtie_lib_$proj group by library");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
566 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
567
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
568 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
569 $map_nr{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
570 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
571
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
572 open (my $lib_stats, "> results/lib_stats_$proj.txt");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
573 print OUT "Library\tBarcode Sequence\tBarcode\tIRDR\tMappable\tUnique mappable\tTotal map\tUnique map\tNR map\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
574 print $lib_stats "Library,Barcode Sequence,with Barcode,with Mutagen,Total Mappable,Total Mapped,Mappable,Mapped,NR Inserts\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
575 foreach $item (@library) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
576 print OUT "$item\t$sequence{$item}\t$barcode_count{$item}\t$IRDR_count{$item}\t$IRDR_good{$item}\t$IRDR_unique{$item}\t$map_count{$item}\t$map_total{$item}\t$map_nr{$item}\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
577 if (length($item) == 0) { next; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
578 print $lib_stats "$item,$sequence{$item}";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
579 &print_stat_val($lib_stats, \$barcode_count{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
580 &print_stat_val($lib_stats, \$IRDR_count{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
581 &print_stat_val($lib_stats, \$IRDR_good{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
582 &print_stat_val($lib_stats, \$IRDR_unique{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
583 &print_stat_val($lib_stats, \$map_count{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
584 &print_stat_val($lib_stats, \$map_total{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
585 &print_stat_val($lib_stats, \$map_nr{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
586 print $lib_stats "\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
587 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
588 close $lib_stats;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
589
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
590 print OUT "\n\nC. Regions associated with project $proj\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
591
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
592 %mappable='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
593 %name='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
594 %count01='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
595 %count001='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
596 %count0001='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
597 %count0='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
598 @library='';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
599
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
600 $sth = $dbh->prepare("select * from lib_count_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
601 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
602
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
603 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
604 $name{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
605 push(@library,$row[0]);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
606 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
607
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
608 $sth = $dbh->prepare("select * from lib_mappable_$proj");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
609 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
610
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
611 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
612 $mappable{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
613 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
614
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
615 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .01 group by A.library order by chromo,start+0");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
616 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
617
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
618 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
619 $count01{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
620 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
621
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
622 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .001 group by A.library order by chromo,start+0");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
623 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
624
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
625 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
626 $count001{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
627 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
628
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
629 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .0001 group by A.library order by chromo,start+0");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
630 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
631
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
632 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
633 $count0001{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
634 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
635
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
636 $sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > 0 group by A.library order by chromo,start+0");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
637 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
638
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
639 open my $region_stats, "> results/region_stats_$proj.txt";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
640 print OUT "Library\tTotal mappable\ttotal map\tLocations >0.01\tLocations >0.001\tLocations >0.0001\t All mapped Locations";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
641 print $region_stats "Library,Total mappable,total map,Locations >0.01,Locations >0.001,Locations >0.0001,All mapped Locations\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
642 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
643 $count0{$row[0]} = $row[1];
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
644 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
645
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
646 foreach $item (@library) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
647 print OUT "$item\t$mappable{$item}\t$name{$item}\t$count01{$item}\t$count001{$item}\t$count0001{$item}\t$count0{$item}\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
648 if (length($item) == 0) { next; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
649 print $region_stats "$item";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
650 &print_stat_val($region_stats, \$mappable{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
651 &print_stat_val($region_stats, \$name{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
652 &print_stat_val($region_stats, \$count01{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
653 &print_stat_val($region_stats, \$count001{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
654 &print_stat_val($region_stats, \$count0001{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
655 &print_stat_val($region_stats, \$count0{$item});
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
656 print $region_stats "\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
657 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
658 close OUT;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
659 close $region_stats;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
660
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
661 &set_project_status($proj, 1, 0);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
662
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
663 exit(0);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
664
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
665 sub add_default_quality {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
666 print "Adding default quality values.";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
667 open($input, "<", "data/seqs.tab") || die "Unable to open ${$in_fn_ref}: $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
668 open($output, ">", "data/seqs_w_qual.tab") || die "Unable to open ${$out_fn_ref}: $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
669 while(<$input>) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
670 chomp;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
671 my @split = split("\t", $_);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
672 if ($#split == 1) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
673 print $output join("\t", $split[0], "", $split[1]);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
674 $split[1] =~ s/./I/g;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
675 print $output sprintf("\t%s\n", $split[1]);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
676 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
677 elsif ($#split == 2) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
678 print $output join("\t", $split[0], $split[1], $split[2]);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
679 $split[2] =~ s/./I/g;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
680 print $output sprintf("\t%s\n", $split[2]);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
681 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
682 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
683 copy("data/seqs.tab", "data/original_seqs.tab") || die "Unable to move seqs.tab: $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
684 copy("data/seqs_w_qual.tab", "data/seqs.tab") || die "Unable to move seqs.tab: $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
685 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
686 sub alignment {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
687 unless (-d "mapping") {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
688 mkdir("mapping");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
689 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
690 my @seq_lengths = (-1, 33, 30, 28, 24);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
691 my @bowtie_mismatches = (3, 3, 2, 1, 0);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
692
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
693 for (my $i = 0; $i <= $#seq_lengths; $i++) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
694 if($aligner eq "bwa") {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
695 &prepare_fastq($seq_lengths[$i], "mapping/4map_$i.txt");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
696 &bwa($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/aln$i.txt", "$envDir/mapping/samse$i.txt", "$envDir/mapping/mapping$i.txt", $i != $#bowtie_mismatches);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
697 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
698 elsif($aligner eq "bow_bwa") {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
699 &prepare_fasta($seq_lengths[$i], "mapping/4map_$i.txt");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
700 &bowtie($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/mapping$i.txt", 1);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
701 print "FINISHED Bowtie iteration #$i\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
702 &prepare_fastq($seq_lengths[$i], "mapping/4map_$i_part2.txt");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
703 &bwa($bowtie_mismatches[$i], "$envDir/mapping/4map_$i_part2.txt", "$envDir/mapping/aln$i.txt", "$envDir/mapping/samse$i.txt", "$envDir/mapping/mapping$i_part2.txt", $i != $#bowtie_mismatches);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
704 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
705 else {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
706 &prepare_fasta($seq_lengths[$i], "mapping/4map_$i.txt");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
707 &bowtie($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/mapping$i.txt", $i != $#bowtie_mismatches);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
708 print "FINISHED Bowtie iteration #$i\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
709 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
710 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
711 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
712
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
713 sub bowtie {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
714 my ($mismatches, $input_file, $output_file, $tidy) = @_;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
715 print "bowtie($mismatches, $input_file, $output_file, $tidy)\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
716 $sth = $dbh->prepare("create table if not exists illumina_blastout_$proj ( id varchar(50),strand varchar(20),chromo varchar(20),start varchar(20),mismatch varchar(30))");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
717 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
718
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
719 my $cmd = "$bowtie_exe --quiet -a --best --strata -v $mismatches -m 1 --suppress 5,6,7 -f ";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
720 #if ($quality) { $cmd = $cmd . "-q "; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
721 #else { $cmd = $cmd . "-f "; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
722 $cmd = $cmd . "$bowtie_idx $input_file $output_file |";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
723 print "bowtie cmd: $cmd\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
724 open($bowtie_out, $cmd);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
725 close($bowtie_out);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
726
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
727 $sth = $dbh->prepare("load DATA local INFILE '$output_file' INTO TABLE illumina_blastout_$proj ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
728 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
729
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
730 #Clean out mapped entries for the next round of fast(a|q) generation
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
731 if ($tidy) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
732 $sth = $dbh->prepare("drop table if exists temp_$proj ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
733 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
734
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
735 $sth = $dbh->prepare("create table temp_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,'-',-2),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B; ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
736 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
737
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
738 $sth = $dbh->prepare("delete illumina4dec_$proj from illumina4dec_$proj,temp_$proj where illumina4dec_$proj.id = temp_$proj.id ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
739 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
740 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
741 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
742
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
743 sub bwa {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
744 my ($mismatches, $input_file, $output_aln_file, $output_samse_file, $output_file, $tidy) = @_;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
745 $sth = $dbh->prepare("create table if not exists illumina_blastout_$proj ( id varchar(50),strand varchar(20),chromo varchar(20),start varchar(20),mismatch varchar(30))");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
746 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
747
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
748 my $cmd = "$bwa_exe aln -n $mismatches -M 1 $bwa_idx $input_file > $output_aln_file | ";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
749 open ($bwa_out, $cmd);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
750 while(<$bwa_out>) { print $_; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
751 close($bwa_out);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
752 $cmd = "$bwa_exe samse $bwa_idx $output_aln_file $input_file > $output_samse_file | ";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
753 open ($bwa_out, $cmd);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
754 while(<$bwa_out>) { print $_; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
755 close($bwa_out);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
756
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
757 open ($sam, "<", $output_samse_file) || die "Unable to open samse output file, $output_samse_file: $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
758 open ($final_out, ">", $output_file) || die "Unable to open final output file, $output_file: $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
759 while(<$sam>) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
760 chomp;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
761 if (/^@/) { next; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
762 my @sam_array = split("\t", $_);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
763 if ($#sam_array < 11 || $sam_array[2] eq '*') { next; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
764 my $strand = '+';
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
765 if ($sam_array[1] | 0b11110111111) { $strand = '-'; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
766 my @opts = split(" ", $sam_array[11]);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
767 foreach (@opts) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
768 if (/NM:i:([0-9]*)/) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
769 if ($1 > $mismatches) { next; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
770 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
771 elsif (/X0:i:([0-9]*)/) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
772 if ($1 > 1) { next; }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
773 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
774 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
775 #print "CHROM: $sam_array[2]";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
776 #$sam_array[2] =~ s/"SN:"//g;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
777 #print "\t$sam_array[2]\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
778 print $final_out join("\t", $sam_array[0], $strand, $chrom_map_ref->{$sam_array[2]}, $sam_array[3]-1, $sam_array[4]) . "\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
779 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
780 close($final_out);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
781 close($sam);
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
782
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
783 $sth = $dbh->prepare("load DATA local INFILE '$output_file' INTO TABLE illumina_blastout_$proj ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
784 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
785
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
786 #Clean out mapped entries for the next round of fast(a|q) generation
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
787 if ($tidy) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
788 $sth = $dbh->prepare("drop table if exists temp_$proj ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
789 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
790
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
791 $sth = $dbh->prepare("create table temp_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,'-',-2),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B; ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
792 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
793
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
794 $sth = $dbh->prepare("delete illumina4dec_$proj from illumina4dec_$proj,temp_$proj where illumina4dec_$proj.id = temp_$proj.id ");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
795 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
796 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
797 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
798
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
799 sub prepare_fasta {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
800 my ($length, $mapping_file) = @_;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
801 print "prepare_fasta($length, $mapping_file)\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
802 if ($length <= 0) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
803 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-',char_length(sequence)),sequence from illumina4dec_$proj where length(sequence) > 33");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
804 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
805 else {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
806 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-','$length'),left(sequence,$length) from illumina4dec_$proj where length(sequence) > $length-1");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
807 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
808 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
809
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
810 open (OUT, ">", $mapping_file) || die "Unable to open aligner input file, $mapping_file: $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
811 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
812 print OUT ">$row[0]\n$row[1]\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
813 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
814 close OUT;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
815 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
816
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
817 sub prepare_fastq {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
818 my ($length, $mapping_file) = @_;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
819 if ($legnth <= 0) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
820 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-',char_length(sequence)),sequence,quality from illumina4dec_$proj where length(sequence) > 33");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
821 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
822 else {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
823 $sth = $dbh->prepare("select concat(id,':',library,':',number,'-','$length'),left(sequence,$length),left(quality,$length) from illumina4dec_$proj where length(sequence) > $length-1");
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
824 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
825 $sth->execute;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
826
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
827 open (OUT, ">", $mapping_file) || die "Unable to open aligner input file, $mapping_file: $!\n";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
828 while ((@row) = $sth->fetchrow_array) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
829 print OUT sprintf("\@%s\n%s\n\+%s\n%s\n", $row[0], $row[1], $row[0], substr($row[2], 0, length($row[1])));
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
830 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
831 close OUT;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
832 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
833
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
834 sub print_stat_val {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
835 my ($fh_ptr, $val_ptr) = @_;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
836 if (defined($val_ptr) && ${$val_ptr} > 0) {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
837 print $fh_ptr ",${$val_ptr}";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
838 } else {
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
839 print $fh_ptr ",0";
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
840 }
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
841 }