Mercurial > repos > dongjun > mosaics
diff mosaics/inst/Perl/process_readfiles_PET.pl @ 10:d78c3c5e8ff8 draft
Uploaded
author | dongjun |
---|---|
date | Thu, 10 Jan 2013 16:01:28 -0500 |
parents | b6d0c6ceda2c |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mosaics/inst/Perl/process_readfiles_PET.pl Thu Jan 10 16:01:28 2013 -0500 @@ -0,0 +1,367 @@ +################################################################### +# +# Process read files into bin-level files (PET) +# +# Command arguments: +# - infile: Input file (directory + file name) +# - outdir: Directory of output files +# - format: File format +# - binsize: Bin size +# - collapse: Maximum # of reads allowed at each position (skip if collapse=0) +# - bychr: Construct bin-level files by chromosome? (Y or N) +# - chrinfo: Is the file for chromosome info provided? +# - chrfile: File name for chromosome info (chr size) +# - @excludeChr: Chromosomes to be excluded (vector) +# +# Supported file format: +# - eland_result, sam +# +# Note +# - chromosomes are extracted from read files, after excluding invalid lines +# - uni-reads are assumed +# +################################################################### + +#!/usr/bin/env perl; +use warnings; +use strict; +use Cwd; +use File::Basename; + +my ($infile, $outdir, $format, $binsize, $collapse, $bychr, + $chrinfo, $chrfile, @exclude_chr) = @ARGV; + +# extract only filename from $infile + +my @pr = fileparse( $infile ); +my $filename = $pr[0]; + +# remember current working directory + +my $pwd = cwd(); + +# construct bin-level data based on chromsome info, if provided + +my %bin_count = (); +my $bin_start = 0; +my $bin_stop = 0; + +if ( $chrinfo eq "Y" ) { + open CHR, "$chrfile" or die "Cannot open $chrfile\n"; + + while (<CHR>) { + chomp; + my ( $chrname, $chrsize ) = split /\s+/, $_; + + $bin_start = 0; + $bin_stop = int($chrsize/$binsize); + + for (my $i = $bin_start; $i <= $bin_stop; $i++) { + ${$bin_count{$chrname}}[$i] = 0; + } + } + + close CHR; +} + +# chromosomes to be excluded + +my $ecyn; +my %ec_hash; + +if ( scalar(@exclude_chr) == 0 ) { + $ecyn = "N"; +} else { + $ecyn = "Y"; + @ec_hash{ @exclude_chr } = 0; +} + +# process PET read file + +open IN, "$infile" or die "Cannot open input file $infile\n"; + +my %seen =(); + +my %seen_pos =(); # position (leftmost position, starting from 1) +my %seen_str = (); # strand ('F' or 'R') + +my $start; +my $end; + +while(<IN>){ + # parse + + chomp; + + # procee read file, based on "format" option + + my @parsed; + + if ( $format eq "eland_result" ) { + @parsed = eland_result( $_ ); + } elsif ( $format eq "sam" ) { + @parsed = sam( $_ ); + } else { + # Unsupported file format -> exit and return 1 to environment + + exit 1; + } + + # skip if invalid line + + next if ( $parsed[0] == 0 ); + + # otherwise, process it + + my ($status, $id, $chrt, $pos, $str, $read_length, $prob ) = @parsed; + my ($id_only, $pair) = split /\//, $id; + + # skip if chrt \in @exclude_chr + + if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } ) { + next; + } + + # process coordinates + + if ( exists $seen_pos{$id_only} ) { + # if other pair was already observed (w.r.t. id), + # then write the coordinates of both reads in the pair + + if ( $seen_str{$id_only} ne $str ) { + # pos1 & pos2 should have different strands and pair numbers + + my $pos1 = $seen_pos{$id_only}; + my $pos2 = $pos; + + # in the pair, write upstream pos first, + # then downstream pos + read length - 1. + # [Note] eland_result: pos = leftmost position, starting from 1. + + #if ( $pos1 < $pos2 && $str eq "R" ) { + # $pos2 += $read_length - 1; + # $start = $pos1; + # $end = $pos2; + #} elsif ( $pos1 > $pos2 && $seen_str{$id_only} eq "R" ) { + # $pos1 += $read_length - 1; + # $start = $pos2; + # $end = $pos1; + #} else { + # print "check id $id!\n"; + #} + if ( $str eq "R" ) { + $pos2 += $read_length - 1; + $start = $pos1; + $end = $pos2; + } elsif ( $seen_str{$id_only} eq "R" ) { + $pos1 += $read_length - 1; + $start = $pos2; + $end = $pos1; + } else { + print "check id $id!\n"; + } + + # process to bin-level files if collapse condition is satisfied + + my $id_collapse = join("",$chrt,$start,$end); + $seen{$id_collapse}++; + + if ( $collapse > 0 && $seen{$id_collapse} > $collapse ) { + next; + } + + # update bin count + + if ( exists $bin_count{$chrt} ) { + # if there is already a matrix for chromosome, update it + + $bin_start = int($start/$binsize) ; + $bin_stop = int($end/$binsize) ; + for (my $i = $bin_start; $i <= $bin_stop; $i++) { + ${$bin_count{$chrt}}[$i] += $prob; + } + } else { + # if there is no matrix for chromosome yet, + + if ( $chrinfo eq "Y" ) { + # if chr info is provided, do not construct new one + + next; + } else { + # if no chr info is provided, construct one + + @{$bin_count{$chrt}} = (); + $bin_start = int($start/$binsize) ; + $bin_stop = int($end/$binsize) ; + for (my $i = $bin_start; $i <= $bin_stop; $i++) { + ${$bin_count{$chrt}}[$i] += $prob; + } + } + } + + # delete the key to save memory + + delete $seen_pos{$id_only}; + delete $seen_str{$id_only}; + } else { + print "inappropriate pair for id $id.\n"; + } + } else { + # if other pair was not observed yet, record it + + $seen_pos{$id_only} = $pos; + $seen_str{$id_only} = $str; + } +} + +close( IN ); + +# move to output directory + +chdir($outdir); + +# write bin-level files + +if ( $bychr eq "N" ) { + # genome-wide version: all chromosome in one file + + my $outfile = $filename."_bin".$binsize.".txt"; + open OUT, ">$outfile" or die "Cannot open $outfile\n"; + + foreach my $chr_id (keys %bin_count) { + my @bin_count_chr = @{$bin_count{$chr_id}}; + + for( my $i = 0; $i< scalar(@bin_count_chr); $i++ ){ + my $coord = $i*$binsize; + if ( $bin_count_chr[$i] ) { + print OUT "$chr_id\t$coord\t$bin_count_chr[$i]\n"; + } else { + print OUT "$chr_id\t$coord\t0\n"; + } + } + } + + close OUT; +} else { + # chromosome version: one chromosome in each file + + foreach my $chr_id (keys %bin_count) { + #my $outfile = $chr_id."_".$filename."_bin".$binsize.".txt"; + my $outfile = $filename."_bin".$binsize."_".$chr_id.".txt"; + open OUT, ">$outfile" or die "Cannot open $outfile\n"; + + my @bin_count_chr = @{$bin_count{$chr_id}}; + + for( my $i = 0; $i< scalar(@bin_count_chr); $i++ ){ + my $coord = $i*$binsize; + if ( $bin_count_chr[$i] ) { + print OUT "$chr_id\t$coord\t$bin_count_chr[$i]\n"; + } + else { + print OUT "$chr_id\t$coord\t0\n"; + } + } + + close OUT; + } +} + + +################################################################################ +# # +# subroutines # +# # +################################################################################ + +# -------------------------- subroutine: eland result -------------------------- + +sub eland_result { + my $line = shift; + + # parse line + + my ($id, $seq, $map, $t3, $t4, $t5, $chrt, $pos, $str, @rest) = split /\t/, $line; + my $read_length = length $seq; + + # exclude invalid lines + + if ( $map eq "QC" || $map eq "NM" ) { + my @status = 0; + return @status; + } + + if ( $chrt eq "QC" || $chrt eq "NM" ) { + my @status = 0; + return @status; + } + + # exclude multi-reads + + if ( $map eq "R0" || $map eq "R1" || $map eq "R2" ) { + my @status = 0; + return @status; + } + + if ( $chrt eq "R0" || $chrt eq "R1" || $chrt eq "R2" ) { + my @status = 0; + return @status; + } + + my @pos_sets = split( /,/, $pos ); + if ( scalar(@pos_sets) > 1 ) { + my @status = 0; + return @status; + } + + # return + + my $status = 1; + my $prob = 1; + my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob ); + return @parsed; +} + +# -------------------------- subroutine: SAM ----------------------------------- + +sub sam { + my $line = shift; + + # parse line + + if ( $line =~ /^[^@].+/ ) { + # exclude lines starting with "@" (comments) + + my ($id, $bwflag, $chrt, $pos, $t2, $t3, $t4, $t5, $t6, $seq, $t7 ) = split /\t/, $line; + $pos = int($pos); + my $read_length = length $seq; + + # strand + + my $str; + + if ( $bwflag & 4 or $bwflag & 512 or $bwflag & 1024 ) { + # exclude invalid lines + + my @status = 0; + return @status; + } elsif ( $bwflag & 16 ) { + # reverse strand + + $str = "R"; + } else { + $str = "F"; + } + + # exclude multi-reads? + + # return + + my $status = 1; + my $prob = 1; + my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob ); + return @parsed; + } else { + my @status = 0; + return @status; + } +}