view lib/TAPDANCE.pl @ 3:17ce4f3bffa2 default tip

Uploaded
author jesse-erdmann
date Tue, 24 Jan 2012 18:33:41 -0500
parents 1437a2df99c0
children
line wrap: on
line source

# Description:  Processing of transposon instertion sequences.
# Requirements perl DBI read write accessible database
# Requires a file labeled barcode.txt, and a tab delimited file contatining seqeunces.
# Should work on files of less than  40 million sequences.
# Identifies and removes barcodes, IRDR sequences and linker sequences from sequencing data, then compiles a list of non redundant sequences.
# This scripts conducts the PAPS steps 1
# Aaron Lyman Sarver July 12 2010
# version 3.1   December 1 2010

use File::Copy;
require "config.pl";

my $path = $0;
$path =~ s/\/\w*\.pl$//g;
print $path . "\n";
require "$path/util.pl";
require "$path/project_man.pl";

my $chrom_map_ref = &get_chrom_map();

print join("\t", keys %{$chrom_map_ref}) . "\n";

unless (-d "results") {
    mkdir("results");
}

#Remove old data if it exists
$sth = $dbh->prepare("drop table if exists illumina_raw_$proj");
$sth->execute;
$sth = $dbh->prepare("drop table if exists barcode_$proj");
$sth->execute;
$sth = $dbh->prepare("drop table if exists illumina_decoded_$proj");
$sth->execute;
$sth = $dbh->prepare("drop table if exists illumina_without_IRDR_$proj");
$sth->execute;
$sth = $dbh->prepare("drop table if exists illumina_nr_pre_map_$proj");
$sth->execute;
$sth = $dbh->prepare("drop table if exists illumina4dec_$proj");
$sth->execute;
$sth = $dbh->prepare("drop table if exists illumina4dec2_$proj");
$sth->execute;
$sth = $dbh->prepare("drop table if exists illumina_blastout_$proj");
$sth->execute;

#1.1  Part 1 load database 
print "Started PROCESSING\n Tables identified by the postfix $proj in the database";
#Determine number of columns in seqs.tab 
open(SEQS, "<", "data/seqs.tab"); 
my @cols = split("\t", <SEQS>); 
close(SEQS);
print join(":", @cols);
print "\n"; 
if ($#cols <= 2) {
    &add_default_quality();
}
$sth = $dbh->prepare("create table illumina_raw_$proj ( id varchar(20), description varchar(20), sequence varchar(100), quality varchar(100) ) "); 
$sth->execute;

print "generated illumina_raw table\n";

$sth = $dbh->prepare("load DATA local INFILE 'data/seqs.tab' INTO TABLE illumina_raw_$proj ");
$sth->execute;

print "1.1 complete - Loaded illumina_raw &r/m data/seqS.tab file \n";

#1.2
$sth = $dbh->prepare("create table barcode_$proj ( seq varchar(20), library varchar(30), direction varchar(20) )");
$sth->execute;

print "1.2 complete - Loaded barcode from data/barcode2lib.txt\n";

$sth = $dbh->prepare("load DATA local INFILE 'data/barcode2lib.txt' INTO TABLE barcode_$proj");
$sth->execute;

print "1.2 complete - Loaded barcode from data/barcode2lib.txt\n";

#1.3
$sth = $dbh->prepare("select count(*) from illumina_raw_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print $row[0]."   Number of lines in raw data file\n";
}

$sth = $dbh->prepare("select count(distinct id) from illumina_raw_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print $row[0]."   Number of unique ids in illumina raw\n";
}
#1.4
$sth = $dbh->prepare("select count(*) from barcode_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print $row[0]."  Number of lines in barcode \n";
}

#2.1 map the barcodes to the sequences..migrated to functionin config.pl.
my $barcode_length = 0;
open (BARCODE, "<", "data/barcode2lib.txt") || die "Unable to open barcode to library file. $!\n";
while (<BARCODE>) {
    chomp;
    my @split = split("\t", $_);
    if (length($split[0]) > $barcode_length) { $barcode_length = length($split[0]); }
}
close(BARCODE);

$sth = $dbh->prepare("create table illumina_decoded_$proj select library, id, substring(sequence,$barcode_length+1) as decoded_sequence, substring(quality, $barcode_length+1) as decoded_quality from barcode_$proj,illumina_raw_$proj where sequence like concat(seq,'%')"); 
$sth->execute;
print "Mapped barcodes to sequences in table illumina_decoded2\n";
print "2.1 complete - Mapped barcodes to sequences in table illumina_decoded\n";

#2.2
$sth = $dbh->prepare("select distinct library, count(id) from illumina_decoded_$proj group by library;");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print $row[0]."\t".$row[1]." inserts\n";
}

#2.3 
if (!defined($mutagens)) {
    #454
    $mutagens="__GTATGTAAACTTCCGACTTCAACTG";
    #illumina data
    #$mutagens = "TGTATGTAAACTTCCGACTTCAACTG";
    #illumina_2 data
    #$mutagens = "___TGTATGTAAACTTCCGACTTCAACTG";
    #MULV
    #$mutagens = "CCAAACCTACAGGTGGGGTCTTTCA";
}

my @mutagens_array = split(",", $mutagens);
my $first = 1;

foreach my $mutagen (@mutagens_array) {
    $mutagen =~ s/^\s+//;
    $mutagen =~ s/\s+$//;
    if ($first) {
        $sql_str = "create table ";
        $first = 0;
    } else {
        $sql_str = "insert into ";
    }
    $sql_str = join("", $sql_str, "illumina_without_IRDR_$proj select library,id,substring(decoded_sequence,", length($mutagen)+1, ") as insertion_sequence, substring(decoded_quality, ", length($mutagen)+1, ") as insertion_quality, 'good' as type from illumina_decoded_$proj where decoded_sequence like '", $mutagen, "%'");
    $sth = $dbh->prepare($sql_str);
    $sth->execute;
}
print "2.3 - Mapped and removed IRDR sequences, created table illumina_without_IRDR\n"; 

# 3.1 Find and Remove internal linker

$sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'idR'  where insertion_sequence like '%GTCCCTTAAGCGGAGCC%'");
$sth->execute;
$sth = $dbh->prepare("update illumina_without_IRDR_$proj set insertion_sequence =  substring_index(insertion_sequence,'GTCCCTTAAGCGGAGCC',1) where type = 'idR';");
print "3.1 - Identified RIght linker in remaining seqeunces and renamed to idR.\n";

$sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'idL'  where insertion_sequence like '%TAGTCCCTTAAGCGGAG%'");
$sth->execute;
$sth = $dbh->prepare("update illumina_without_IRDR_$proj set insertion_sequence =  substring_index(insertion_sequence,'TAGTCCCTTAAGCGGAG',1) where type = 'idL';");
$sth->execute;
print "3.2 - Identified Left linker in remaining seqeunces and renamed to idL.\n";

# Part 3.3 Find and Remove GGATCC sequences
$sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'bam'  where insertion_sequence like '_____GGATCC%'");
$sth->execute;
print "3.3 - Identified seqeunce with GGATCC and renamed.\n";

# Part 3.4 Find and remove any remaining seqeunces that do not start with TA seqeunce
$sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'noTA'  where insertion_sequence not like 'TA%'");
$sth->execute;
print "3.4 - Identified seqeunce without TA and renamed to noTA\n";

# 3.5 Find and identify any sequences that are two short for meaningful mapping
$sth = $dbh->prepare("update illumina_without_IRDR_$proj set type = 'min'  where (type = 'good' or type = 'idR' or type = 'idL') and char_length(insertion_sequence)<24");
$sth->execute;
print "3.5 Identified short seqeunces and label them min\n";

#4.1 
$sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (type)");
$sth->execute;

$sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (insertion_sequence)"); 
$sth->execute;

#if ($quality) {
    $sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (insertion_quality)"); 
    $sth->execute;
    print "indexed table illumina_without_IRDR\n";
#}

$sth = $dbh->prepare("alter table illumina_without_IRDR_$proj add index (library)");
$sth->execute;
print "4.1 - indexed table illumina_without_IRDR\n";

####
#4.2 Remove duplicate sequences maintaining count of all
####
$sth = $dbh->prepare("create table illumina_nr_pre_map_$proj as select distinct count(type) as number, insertion_sequence, insertion_quality, library from illumina_without_IRDR_$proj where (type = 'good' or type = 'idR' or type = 'idL')  group by library, insertion_sequence, insertion_quality");
$sth->execute;

print "4.2 - Created illumina_nr_pre_map table\n";

#4.3
$sth = $dbh->prepare("create table illumina4dec_$proj (id INT unsigned not null auto_increment, primary key(ID), number varchar(20), library varchar(30),sequence varchar(100), quality varchar(100) )");
$sth->execute;
print "4.3 - Created illumina4dec table\n";

#if ($quality) {
    $sth = $dbh->prepare("insert into illumina4dec_$proj select NULL,number,library, insertion_sequence, insertion_quality from illumina_nr_pre_map_$proj");
#}
#else {
#    $sth = $dbh->prepare("insert into illumina4dec_$proj select NULL,number,library, insertion_sequence from illumina_nr_pre_map_$proj");
#}
$sth->execute;

print "4.4 - populated nr_id table\n";


$sth = $dbh->prepare("create table illumina4dec2_$proj select * from illumina4dec_$proj");
$sth->execute;
print "4.5 - Copied illumina4dec table to illumina4dec2 for later use\n";

####
# Bowtie variable definitions
####
if (length($bowtie_exe) == 0) { $bowtie_exe = "/project/ccbioinf/Projects/labs/largaespada/Illumina_data/bowtie-0.12.5/bowtie"; }
if (length($bowtie_idx) == 0) { $bowtie_idx = "mm9"; }
if (length($envDir) == 0) { $envDir = "."; }

####
#5 This section of the script runs the bowtie algorythm.
####
&alignment();
print "5.1c - FINISHED Bowtie iteration #1\n";

$sth = $dbh->prepare("drop table if exists bowtie_$proj ");
$sth->execute;
$sth = $dbh->prepare("drop table if exists bowtie_lib_$proj ");
$sth->execute;

#####
# 6 reduce to NR set and load library and location mapping.
######
$sth = $dbh->prepare("create table bowtie_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,':',-2),':',1) as library, substring_index(substring_index(B.id,':',-1),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B");
$sth->execute;

$sth = $dbh->prepare("alter table bowtie_$proj add pos varchar(20)");
$sth->execute;
$sth = $dbh->prepare("alter table bowtie_$proj add stop varchar(20)");
$sth->execute;

$sth = $dbh->prepare("update bowtie_$proj set pos = size + start where strand = '-'");
$sth->execute;

$sth = $dbh->prepare("update bowtie_$proj set pos = start + 1 where strand = '+'");
$sth->execute;

$sth = $dbh->prepare("update bowtie_$proj set stop = start + size");
$sth->execute;

print "6.1 6.2 6.3 6.4 - created bowtie table and adjusted \n";
###
#6.5 determine transposon orientation (is it in the A or B orientation?)
###

$sth = $dbh->prepare("alter table bowtie_$proj add orient varchar(5) default 'U'");
$sth->execute;

$sth = $dbh->prepare("update bowtie_$proj set orient = '-' where library like '%-L' and strand = '+'");
$sth->execute;

$sth = $dbh->prepare("update bowtie_$proj set orient = '+' where library like '%-L' and strand = '-'");
$sth->execute;

$sth = $dbh->prepare("update bowtie_$proj set orient = '-' where library like '%-R' and strand = '-'");
$sth->execute;

$sth = $dbh->prepare("update bowtie_$proj set orient = '+' where library like '%-R' and strand = '+'");
$sth->execute;



print "6.5 - determined mapping orientation\n";

#6.6
$sth = $dbh->prepare("select count(*) from temp_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print "6.6 - ".$row[0]."   Number of unique seqeunces that mapped\n";
}
#6.7
$sth = $dbh->prepare("select count(distinct chromo, start) from temp_$proj order by chromo, start+0; ");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print "6.7 - ".$row[0]."  Number of distinct genomic regions that were mapped to\n";
$nr_count = $row[0];
}

#7.1
$sth = $dbh->prepare("create table bowtie_lib_$proj select distinct library,chromo, start,stop, pos, sum(number) as count,strand, orient from bowtie_$proj group by library,chromo,start,stop,pos,strand order by chromo,start+0,orient");
$sth->execute;

print "7.1 - created bowtie_lib table\n";

$sth = $dbh->prepare("select chromo, start, stop, concat(library,'-',count),count,strand,start,stop from bowtie_lib_$proj");
#$sth = $dbh->prepare("select chromo, start, start+1, concat(library,count),count,'+',start,start+1 from bowtie_lib_$proj");
$sth->execute;
#7.2
open OUT, "> results/raw_$proj.BED";
print OUT "track type ='BED' name='raw$proj' description='all$proj' visibility=2 itemRgb='On'";
while ((@row) = $sth->fetchrow_array) {
    print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]";}
    #print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t255,0,0";}
close OUT;
#7.3
$sth = $dbh->prepare("drop table if exists convert_2_hex_$proj");
$sth->execute;
$sth = $dbh->prepare("create table convert_2_hex_$proj (orient varchar(5), hex varchar(20))");
$sth->execute;
$sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"A\", \"255,0,0\")");
$sth->execute;
$sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"B\", \"0,0,255\")");
$sth->execute;
$sth = $dbh->prepare("insert into convert_2_hex_$proj VALUES(\"U\", \"0,0,0\")");
$sth->execute;

$sth = $dbh->prepare("select chromo, start, stop, concat(library,'-',count),count,strand,start,stop from bowtie_lib_$proj where count > 9");
#select chromo, start, start+1, concat(library,count),count,'+',start,start+1, hex from bowtie_lib_$proj A, convert_2_hex B where A.orient = B.orient and count > 9");
#$sth = $dbh->prepare("select chromo, start, start+1, concat(library,count),count,'+',start,start+1 from bowtie_lib_$proj where count > 9");
$sth->execute;
open OUT, "> results/raw10_$proj.BED";
print OUT "track type ='BED' name='raw10$proj' description='10$proj' visibility=2 itemRgb='On'";
while ((@row) = $sth->fetchrow_array) {
    print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t$row[8]";}
    #print OUT "\n$row[0]\t$row[1]\t$row[2]\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\t255,0,0";}
close OUT;

#8.1

$sth = $dbh->prepare("drop table if exists illumina_hist_$proj");
$sth->execute;

$sth = $dbh->prepare("drop table if exists illumina_hist_raw_$proj");
$sth->execute;

$sth = $dbh->prepare("create table illumina_hist_raw_$proj select library, chromo,round(pos,-2) as start, count,orient from bowtie_lib_$proj;");
$sth->execute;

$sth = $dbh->prepare("update illumina_hist_raw_$proj set library = left(library,LENGTH(library)-2) where library like '%-L'");
$sth->execute;
$sth = $dbh->prepare("update illumina_hist_raw_$proj set library = left(library,LENGTH(library)-2) where library like '%-R'");
$sth->execute;

$sth = $dbh->prepare("create table illumina_hist_$proj select distinct library, chromo,start, sum(count) as count,orient from illumina_hist_raw_$proj group by library,chromo,start,orient;");
$sth->execute;

$sth = $dbh->prepare("drop table if exists illumina_hist_raw_$proj");
$sth->execute;
$sth = $dbh->prepare("drop table if exists lib_count_$proj");
$sth->execute;
#8.2
$sth = $dbh->prepare("create table lib_count_$proj select distinct library, sum(count) as total from illumina_hist_$proj group by library;");
$sth->execute;

#8.3 the following was added to improve process. Rather then using the total mapped as the denominator in the nr ratio, the total mappable should be used. This will minimize the effect of the crossmapping problem by increasing the requirements for inclusion into the library nr lists sfor librarys wiht low percentage mappiung such as woud exist form mapping to the wrong species

$sth = $dbh->prepare("drop table if exists lib_m_$proj");
$sth->execute;
$sth = $dbh->prepare("create table lib_m_$proj select distinct library, sum(number) as total from illumina4dec2_$proj group by library");
$sth->execute;
$sth = $dbh->prepare("update lib_m_$proj set library = left(library,LENGTH(library)-2) where library like '%-L'");
$sth->execute;
$sth = $dbh->prepare("update lib_m_$proj set library = left(library,LENGTH(library)-2) where library like '%-R'");
$sth->execute;
$sth = $dbh->prepare("drop table if exists lib_mappable_$proj");
$sth->execute;
$sth = $dbh->prepare("create table lib_mappable_$proj select library, sum(total) as total from lib_m_$proj group by library");
$sth->execute;



print "completed mapping!!!";

###
#9 Generate a report describing the mapping.
###

open OUT, "> results/summary_$proj.txt";
print OUT "Summary of project $proj\n\nA.Report of project based counts \n";

$sth = $dbh->prepare("select count(*) from illumina_raw_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of Seqeunces\n";
}

$sth = $dbh->prepare("select count(*) from illumina_decoded_$proj;");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of Sequences that map to a barcode\n";
}

$sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of Sequences with a barcode that also have an IRDR\n";
}

$sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'idR';");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of Ideal Right linker sequences\n";
}

$sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'idL';");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of Ideal left linkersequences\n";
}



$sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'good';");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of good sequences\n";
}

$sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'bam';");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of Bamh1 sequence Artifact (removed)\n";
}

$sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'noTA';");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of sequences removed because of lack of TA sequence)\n";
}

$sth = $dbh->prepare("select count(*) from illumina_without_IRDR_$proj where type = 'min';");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total number of sequences removed because of sequence length < 24)\n";
}     




$sth = $dbh->prepare("select count(*) from illumina4dec2_$proj;");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
      print OUT $row[0]."   Total number of unique seqeunces pryor to mapping\n";
}

$sth = $dbh->prepare("select sum(number) from bowtie_$proj;");
$sth->execute;
while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Total SUM  of seqeunces that mapped\n";
}

$sth = $dbh->prepare("select count(*) from bowtie_$proj;");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Number of unique sequences that mapped\n";
}

$sth = $dbh->prepare("select count(*) from bowtie_lib_$proj;");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Number of distinct locations mapped\n";
}

$sth = $dbh->prepare("select count(*) from illumina_hist_$proj;");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       print OUT $row[0]."   Number of distinct regions mapped\n";
}
close OUT;

#report part B
open OUT, ">> results/summary_$proj.txt";
print OUT"\n\nB. Library counts associated with project $proj\n";

%sequence='';
%barcode_count='';
%IRDR_count='';
%IRDR_good='';
%IRDR_unique='';
%map_count='';
%map_total='';
%nr_count='';
@library='';

$sth = $dbh->prepare("select * from barcode_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $sequence{$row[1]} = $row[0];
       push(@library,$row[1]);
}

$sth = $dbh->prepare("select library, count(id) from illumina_decoded_$proj group by library");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $barcode_count{$row[0]} = $row[1];
}

$sth = $dbh->prepare("select library, count(id) from illumina_without_IRDR_$proj group by library");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $IRDR_count{$row[0]} = $row[1];
}



$sth = $dbh->prepare("select library, sum(number) from illumina4dec2_$proj group by library;");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $IRDR_good{$row[0]} = $row[1];
}


$sth = $dbh->prepare("select library, count(number) from illumina4dec2_$proj group by library;");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $IRDR_unique{$row[0]} = $row[1];
}

$sth = $dbh->prepare("select library, sum(number) from bowtie_$proj group by library");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $map_count{$row[0]} = $row[1];
}

$sth = $dbh->prepare("select library, count(id) from bowtie_$proj group by library");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $map_total{$row[0]} = $row[1];
}

$sth = $dbh->prepare("select library, count(chromo) from bowtie_lib_$proj group by library");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $map_nr{$row[0]} = $row[1];
}

open (my $lib_stats, "> results/lib_stats_$proj.txt");
print OUT "Library\tBarcode Sequence\tBarcode\tIRDR\tMappable\tUnique mappable\tTotal map\tUnique map\tNR map\n";
print $lib_stats "Library,Barcode Sequence,with Barcode,with Mutagen,Total Mappable,Total Mapped,Mappable,Mapped,NR Inserts\n";
foreach $item (@library) {
    print OUT "$item\t$sequence{$item}\t$barcode_count{$item}\t$IRDR_count{$item}\t$IRDR_good{$item}\t$IRDR_unique{$item}\t$map_count{$item}\t$map_total{$item}\t$map_nr{$item}\n";
    if (length($item) == 0) { next; }
    print $lib_stats "$item,$sequence{$item}";
    &print_stat_val($lib_stats, \$barcode_count{$item});
    &print_stat_val($lib_stats, \$IRDR_count{$item});
    &print_stat_val($lib_stats, \$IRDR_good{$item});
    &print_stat_val($lib_stats, \$IRDR_unique{$item});
    &print_stat_val($lib_stats, \$map_count{$item});
    &print_stat_val($lib_stats, \$map_total{$item});
    &print_stat_val($lib_stats, \$map_nr{$item});
    print $lib_stats "\n";
}
close $lib_stats;

print OUT "\n\nC. Regions associated with project $proj\n";

%mappable='';
%name='';
%count01='';
%count001='';
%count0001='';
%count0='';
@library='';

$sth = $dbh->prepare("select * from lib_count_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $name{$row[0]} = $row[1];
       push(@library,$row[0]);
}

$sth = $dbh->prepare("select * from lib_mappable_$proj");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $mappable{$row[0]} = $row[1];
}

$sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .01  group by A.library order by chromo,start+0");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $count01{$row[0]} = $row[1];
}

$sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .001  group by A.library order by chromo,start+0");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $count001{$row[0]} = $row[1];
}

$sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > .0001  group by A.library order by chromo,start+0");
$sth->execute;

while ((@row) = $sth->fetchrow_array) {
       $count0001{$row[0]} = $row[1];
}

$sth = $dbh->prepare("select A.library, count(chromo) from illumina_hist_$proj A, lib_mappable_$proj B where A.library = B.library and count/total > 0  group by A.library order by chromo,start+0");
$sth->execute;

open my $region_stats, "> results/region_stats_$proj.txt";
print OUT "Library\tTotal mappable\ttotal map\tLocations >0.01\tLocations >0.001\tLocations >0.0001\t All mapped Locations";
print $region_stats "Library,Total mappable,total map,Locations >0.01,Locations >0.001,Locations >0.0001,All mapped Locations\n";
while ((@row) = $sth->fetchrow_array) {
       $count0{$row[0]} = $row[1];
}

foreach $item (@library) {
    print OUT "$item\t$mappable{$item}\t$name{$item}\t$count01{$item}\t$count001{$item}\t$count0001{$item}\t$count0{$item}\n";
    if (length($item) == 0) { next; }
    print $region_stats "$item";
    &print_stat_val($region_stats, \$mappable{$item});
    &print_stat_val($region_stats, \$name{$item});
    &print_stat_val($region_stats, \$count01{$item});
    &print_stat_val($region_stats, \$count001{$item});
    &print_stat_val($region_stats, \$count0001{$item});
    &print_stat_val($region_stats, \$count0{$item});
    print $region_stats "\n";
}
close OUT;
close $region_stats;

&set_project_status($proj, 1, 0);

exit(0);

sub add_default_quality {
    print "Adding default quality values.";
    open($input, "<", "data/seqs.tab") || die "Unable to open ${$in_fn_ref}: $!\n";
    open($output, ">", "data/seqs_w_qual.tab") || die "Unable to open ${$out_fn_ref}: $!\n";
    while(<$input>) {
	chomp;
	my @split = split("\t", $_);
	if ($#split == 1) {
	    print $output join("\t", $split[0], "", $split[1]);
	    $split[1] =~ s/./I/g;
	    print $output sprintf("\t%s\n", $split[1]);
	}
	elsif ($#split == 2) {
	    print $output join("\t", $split[0], $split[1], $split[2]);
	    $split[2] =~ s/./I/g;
	    print $output sprintf("\t%s\n", $split[2]);
	}
    }
    copy("data/seqs.tab", "data/original_seqs.tab") || die "Unable to move seqs.tab: $!\n";
    copy("data/seqs_w_qual.tab", "data/seqs.tab") || die "Unable to move seqs.tab: $!\n";
}
sub alignment {
    unless (-d "mapping") {
	mkdir("mapping");
    }
    my @seq_lengths = (-1, 33, 30, 28, 24);
    my @bowtie_mismatches = (3, 3, 2, 1, 0);
    
    for (my $i = 0; $i <= $#seq_lengths; $i++) {
	if($aligner eq "bwa") {
	    &prepare_fastq($seq_lengths[$i], "mapping/4map_$i.txt");
	    &bwa($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/aln$i.txt", "$envDir/mapping/samse$i.txt", "$envDir/mapping/mapping$i.txt", $i != $#bowtie_mismatches);
	}
	elsif($aligner eq "bow_bwa") {
	    &prepare_fasta($seq_lengths[$i], "mapping/4map_$i.txt");
	    &bowtie($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/mapping$i.txt", 1);
   	    print "FINISHED Bowtie iteration #$i\n";
	    &prepare_fastq($seq_lengths[$i], "mapping/4map_$i_part2.txt");
	    &bwa($bowtie_mismatches[$i], "$envDir/mapping/4map_$i_part2.txt", "$envDir/mapping/aln$i.txt", "$envDir/mapping/samse$i.txt", "$envDir/mapping/mapping$i_part2.txt", $i != $#bowtie_mismatches);
	}
	else {
	    &prepare_fasta($seq_lengths[$i], "mapping/4map_$i.txt");
	    &bowtie($bowtie_mismatches[$i], "$envDir/mapping/4map_$i.txt", "$envDir/mapping/mapping$i.txt", $i != $#bowtie_mismatches);
	    print "FINISHED Bowtie iteration #$i\n";
	}
    }
}

sub bowtie {
    my ($mismatches, $input_file, $output_file, $tidy) = @_;
    print "bowtie($mismatches, $input_file, $output_file, $tidy)\n";
    $sth = $dbh->prepare("create table if not exists illumina_blastout_$proj ( id varchar(50),strand varchar(20),chromo varchar(20),start varchar(20),mismatch varchar(30))");
    $sth->execute;
    
    my $cmd = "$bowtie_exe --quiet -a --best --strata -v $mismatches -m 1 --suppress 5,6,7 -f ";
    #if ($quality) { $cmd = $cmd . "-q "; }
    #else { $cmd = $cmd . "-f "; }
    $cmd = $cmd . "$bowtie_idx $input_file $output_file |";
    print "bowtie cmd: $cmd\n";
    open($bowtie_out, $cmd);
    close($bowtie_out);
    
    $sth = $dbh->prepare("load DATA local INFILE '$output_file' INTO TABLE illumina_blastout_$proj ");
    $sth->execute;

    #Clean out mapped entries for the next round of fast(a|q) generation
    if ($tidy) {
	$sth = $dbh->prepare("drop table if exists temp_$proj ");
	$sth->execute;

	$sth = $dbh->prepare("create table temp_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,'-',-2),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B; ");
	$sth->execute;

	$sth = $dbh->prepare("delete illumina4dec_$proj from illumina4dec_$proj,temp_$proj where illumina4dec_$proj.id = temp_$proj.id ");
	$sth->execute;
    }
}

sub bwa {
    my ($mismatches, $input_file, $output_aln_file, $output_samse_file, $output_file, $tidy) = @_;
    $sth = $dbh->prepare("create table if not exists illumina_blastout_$proj ( id varchar(50),strand varchar(20),chromo varchar(20),start varchar(20),mismatch varchar(30))");
    $sth->execute;

    my $cmd = "$bwa_exe aln -n $mismatches -M 1 $bwa_idx $input_file > $output_aln_file | ";
    open ($bwa_out, $cmd);
    while(<$bwa_out>) { print $_; }
    close($bwa_out);
    $cmd = "$bwa_exe samse $bwa_idx $output_aln_file $input_file > $output_samse_file | ";
    open ($bwa_out, $cmd);
    while(<$bwa_out>) { print $_; }
    close($bwa_out);

    open ($sam, "<", $output_samse_file) || die "Unable to open samse output file, $output_samse_file: $!\n";
    open ($final_out, ">", $output_file) || die "Unable to open final output file, $output_file: $!\n";
    while(<$sam>) {
	chomp;
	if (/^@/) { next; }
	my @sam_array = split("\t", $_);
	if ($#sam_array < 11 || $sam_array[2] eq '*') { next; }
	my $strand = '+';
	if ($sam_array[1] | 0b11110111111) { $strand = '-'; }
	my @opts = split(" ", $sam_array[11]);
	foreach (@opts) {
	    if (/NM:i:([0-9]*)/) {
		if ($1 > $mismatches) { next; } 
	    }
	    elsif (/X0:i:([0-9]*)/) {
		if ($1 > 1) { next; }
	    }
	}
	#print "CHROM: $sam_array[2]";
	#$sam_array[2] =~ s/"SN:"//g;
	#print "\t$sam_array[2]\n";
	print $final_out join("\t", $sam_array[0], $strand, $chrom_map_ref->{$sam_array[2]}, $sam_array[3]-1, $sam_array[4]) . "\n"; 
    }
    close($final_out);
    close($sam);

    $sth = $dbh->prepare("load DATA local INFILE '$output_file' INTO TABLE illumina_blastout_$proj ");
    $sth->execute;

    #Clean out mapped entries for the next round of fast(a|q) generation
    if ($tidy) {
        $sth = $dbh->prepare("drop table if exists temp_$proj ");
        $sth->execute;

        $sth = $dbh->prepare("create table temp_$proj select substring_index(B.id,':',1) as id, substring_index(substring_index(B.id,'-',-2),'-',1) as number,substring_index(B.id,'-',-1) as size, strand, chromo, start, mismatch from illumina_blastout_$proj B; ");
        $sth->execute;

        $sth = $dbh->prepare("delete illumina4dec_$proj from illumina4dec_$proj,temp_$proj where illumina4dec_$proj.id = temp_$proj.id ");
        $sth->execute;
    }
}

sub prepare_fasta {
    my ($length, $mapping_file) = @_;
    print "prepare_fasta($length, $mapping_file)\n";
    if ($length <= 0) {
	$sth = $dbh->prepare("select concat(id,':',library,':',number,'-',char_length(sequence)),sequence from illumina4dec_$proj where length(sequence) > 33");
    }
    else {
	$sth = $dbh->prepare("select concat(id,':',library,':',number,'-','$length'),left(sequence,$length) from illumina4dec_$proj where length(sequence) > $length-1");
    }
    $sth->execute;

    open (OUT, ">", $mapping_file) || die "Unable to open aligner input file, $mapping_file: $!\n";
    while ((@row) = $sth->fetchrow_array) {
	print OUT ">$row[0]\n$row[1]\n";
    }
    close OUT;
}

sub prepare_fastq {
    my ($length, $mapping_file) = @_;
    if ($legnth <= 0) {
        $sth = $dbh->prepare("select concat(id,':',library,':',number,'-',char_length(sequence)),sequence,quality from illumina4dec_$proj where length(sequence) > 33");
    }
    else {
        $sth = $dbh->prepare("select concat(id,':',library,':',number,'-','$length'),left(sequence,$length),left(quality,$length) from illumina4dec_$proj where length(sequence) > $length-1");
    }
    $sth->execute;

    open (OUT, ">", $mapping_file) || die "Unable to open aligner input file, $mapping_file: $!\n";
    while ((@row) = $sth->fetchrow_array) {
        print OUT sprintf("\@%s\n%s\n\+%s\n%s\n", $row[0], $row[1], $row[0], substr($row[2], 0, length($row[1])));
    }
    close OUT;
}

sub print_stat_val {
    my ($fh_ptr, $val_ptr) = @_;
    if (defined($val_ptr) && ${$val_ptr} > 0) {
	print $fh_ptr ",${$val_ptr}";
    } else {
	print $fh_ptr ",0";
    }
}