view mosaics/inst/Perl/process_readfiles_PET.pl @ 10:d78c3c5e8ff8 draft

Uploaded
author dongjun
date Thu, 10 Jan 2013 16:01:28 -0500
parents b6d0c6ceda2c
children
line wrap: on
line source

###################################################################
#
#   Process read files into bin-level files (PET)
#
#   Command arguments: 
#   - infile: Input file (directory + file name)
#   - outdir: Directory of output files
#   - format: File format
#   - binsize: Bin size
#   - collapse: Maximum # of reads allowed at each position (skip if collapse=0)
#   - bychr: Construct bin-level files by chromosome? (Y or N)
#	- chrinfo: Is the file for chromosome info provided?
#	- chrfile: File name for chromosome info (chr size)
#	- @excludeChr: Chromosomes to be excluded (vector)
#
#   Supported file format: 
#   - eland_result, sam
#
#   Note
#   - chromosomes are extracted from read files, after excluding invalid lines
#   - uni-reads are assumed
#
###################################################################

#!/usr/bin/env perl;
use warnings;
use strict;
use Cwd;
use File::Basename;

my ($infile, $outdir, $format, $binsize, $collapse, $bychr, 
	$chrinfo, $chrfile, @exclude_chr) = @ARGV;

# extract only filename from $infile

my @pr = fileparse( $infile );
my $filename = $pr[0];

# remember current working directory

my $pwd = cwd();

# construct bin-level data based on chromsome info, if provided

my %bin_count = ();
my $bin_start = 0;
my $bin_stop = 0;

if ( $chrinfo eq "Y" ) {
	open CHR, "$chrfile" or die "Cannot open $chrfile\n";
	
	while (<CHR>) {
		chomp;
		my ( $chrname, $chrsize ) = split /\s+/, $_;
		
		$bin_start = 0;
		$bin_stop = int($chrsize/$binsize);
	        
		for (my $i = $bin_start; $i <= $bin_stop; $i++) {
			${$bin_count{$chrname}}[$i] = 0;
		}
	}
	
	close CHR;
}

# chromosomes to be excluded

my $ecyn;
my %ec_hash;

if ( scalar(@exclude_chr) == 0 ) {
	$ecyn = "N";
} else {
	$ecyn = "Y";
	@ec_hash{ @exclude_chr } = 0;	
}

# process PET read file

open IN, "$infile" or die "Cannot open input file $infile\n";

my %seen =();

my %seen_pos =();		# position (leftmost position, starting from 1)
my %seen_str = ();		# strand ('F' or 'R')

my $start;
my $end;

while(<IN>){
	# parse
	
	chomp;	
    
	# procee read file, based on "format" option
    
	my @parsed;
    
	if ( $format eq "eland_result" ) {
		@parsed = eland_result( $_ );
	} elsif ( $format eq "sam" ) {
		@parsed = sam( $_ );
	} else {
        	# Unsupported file format -> exit and return 1 to environment
        
        	exit 1;
	}
    
	# skip if invalid line
    
	next if ( $parsed[0] == 0 );
    
	# otherwise, process it
    
	my ($status, $id, $chrt, $pos, $str, $read_length, $prob ) = @parsed;
	my ($id_only, $pair) = split /\//, $id;
    
	# skip if chrt \in @exclude_chr
    
	if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } ) {
		next;
	}
	
	# process coordinates
	
	if ( exists $seen_pos{$id_only} ) {		
		# if other pair was already observed (w.r.t. id), 
		# then write the coordinates of both reads in the pair
		
		if ( $seen_str{$id_only} ne $str ) {
			# pos1 & pos2 should have different strands and pair numbers
			
			my $pos1 = $seen_pos{$id_only};
			my $pos2 = $pos;
			
			# in the pair, write upstream pos first, 
			# then downstream pos + read length - 1.
			# [Note] eland_result: pos = leftmost position, starting from 1.
			
			#if ( $pos1 < $pos2 && $str eq "R" ) {
			#	$pos2 += $read_length - 1;
			#	$start = $pos1;
			#	$end = $pos2;
			#} elsif ( $pos1 > $pos2 && $seen_str{$id_only} eq "R" ) {
			#	$pos1 += $read_length - 1;
			#	$start = $pos2;
			#	$end = $pos1;
			#} else {
			#	print "check id $id!\n";
			#}
			if ( $str eq "R" ) {
				$pos2 += $read_length - 1;
				$start = $pos1;
				$end = $pos2;
			} elsif ( $seen_str{$id_only} eq "R" ) {
				$pos1 += $read_length - 1;
				$start = $pos2;
				$end = $pos1;
			} else {
				print "check id $id!\n";
			}
    
		    # process to bin-level files if collapse condition is satisfied
		    
		    my $id_collapse = join("",$chrt,$start,$end);
		    $seen{$id_collapse}++;
		    
		    if ( $collapse > 0 && $seen{$id_collapse} > $collapse ) {
		        next;   
		    }
    
			# update bin count
    
			if ( exists $bin_count{$chrt} ) {
				# if there is already a matrix for chromosome, update it
        
				$bin_start = int($start/$binsize) ;
				$bin_stop = int($end/$binsize) ;
				for (my $i = $bin_start; $i <= $bin_stop; $i++) {
					${$bin_count{$chrt}}[$i] += $prob;
				}
			} else {
				# if there is no matrix for chromosome yet, 
        
				if ( $chrinfo eq "Y" ) {
					# if chr info is provided, do not construct new one
	        
					next;
				} else {
					# if no chr info is provided, construct one
	        
					@{$bin_count{$chrt}} = ();
					$bin_start = int($start/$binsize) ;
					$bin_stop = int($end/$binsize) ;
					for (my $i = $bin_start; $i <= $bin_stop; $i++) {
						${$bin_count{$chrt}}[$i] += $prob;
					}
				}
			}
			
			# delete the key to save memory
			
			delete $seen_pos{$id_only};
			delete $seen_str{$id_only};
		} else {
			print "inappropriate pair for id $id.\n";
		}
	} else {
		# if other pair was not observed yet, record it
		
		$seen_pos{$id_only} = $pos;
		$seen_str{$id_only} = $str;
	}
}

close( IN );

# move to output directory

chdir($outdir);

# write bin-level files

if ( $bychr eq "N" ) {
    # genome-wide version: all chromosome in one file
    
    my $outfile = $filename."_bin".$binsize.".txt";
    open OUT, ">$outfile" or die "Cannot open $outfile\n";
    
    foreach my $chr_id (keys %bin_count) {      
        my @bin_count_chr = @{$bin_count{$chr_id}};
        
        for( my $i = 0; $i< scalar(@bin_count_chr); $i++ ){
            my $coord = $i*$binsize;
            if ( $bin_count_chr[$i] ) {
                print OUT "$chr_id\t$coord\t$bin_count_chr[$i]\n";
            } else {
                print OUT "$chr_id\t$coord\t0\n";
            }
        }       
    }
    
    close OUT;
} else {
    # chromosome version: one chromosome in each file
    
    foreach my $chr_id (keys %bin_count) {
        #my $outfile = $chr_id."_".$filename."_bin".$binsize.".txt";
        my $outfile = $filename."_bin".$binsize."_".$chr_id.".txt";
        open OUT, ">$outfile" or die "Cannot open $outfile\n";
        
        my @bin_count_chr = @{$bin_count{$chr_id}};
        
        for( my $i = 0; $i< scalar(@bin_count_chr); $i++ ){
            my $coord = $i*$binsize;
            if ( $bin_count_chr[$i] ) {
                print OUT "$chr_id\t$coord\t$bin_count_chr[$i]\n";
            }
            else {
                print OUT "$chr_id\t$coord\t0\n";
            }
        }
        
        close OUT;
    }
}


################################################################################
#                                                                              #
#                              subroutines                                     #
#                                                                              #
################################################################################

# -------------------------- subroutine: eland result -------------------------- 

sub eland_result {
    my $line = shift;  
    
    # parse line    
    
    my ($id, $seq, $map, $t3, $t4, $t5, $chrt, $pos, $str, @rest) = split /\t/, $line;
    my $read_length = length $seq;
    
    # exclude invalid lines
    
    if ( $map eq "QC" || $map eq "NM" ) {
        my @status = 0;
        return @status;
    }
    
    if ( $chrt eq "QC" || $chrt eq "NM" ) {
        my @status = 0;
        return @status;
    }
    
    # exclude multi-reads
    
    if ( $map eq "R0" || $map eq "R1" || $map eq "R2" ) {
        my @status = 0;
        return @status;
    }
    
    if ( $chrt eq "R0" || $chrt eq "R1" || $chrt eq "R2" ) {
        my @status = 0;
        return @status;
    }
    
    my @pos_sets = split( /,/, $pos );
    if ( scalar(@pos_sets) > 1 ) {
        my @status = 0;
        return @status;
    }
    
    # return
    
    my $status = 1;
    my $prob = 1;
    my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
    return @parsed;
}

# -------------------------- subroutine: SAM -----------------------------------

sub sam {
    my $line = shift;   
    
    # parse line
    
    if ( $line =~ /^[^@].+/ ) { 
        # exclude lines starting with "@" (comments)
        
        my ($id, $bwflag, $chrt, $pos, $t2, $t3, $t4, $t5, $t6, $seq, $t7 ) = split /\t/, $line;
        $pos = int($pos);
        my $read_length = length $seq;
        
        # strand
        
        my $str;
        
        if ( $bwflag & 4 or $bwflag & 512 or $bwflag & 1024 ) {
            # exclude invalid lines
                
            my @status = 0;
            return @status;
        } elsif ( $bwflag & 16 ) {
            # reverse strand
            
            $str = "R";
        } else {
            $str = "F";
        }   
    
        # exclude multi-reads?
        
        # return
        
        my $status = 1;
    	my $prob = 1;
        my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
        return @parsed;         
    } else {
        my @status = 0;
        return @status; 
    }
}