diff tools/sr_mapping/lastz_paired_reads_wrapper.py @ 0:9071e359b9a3
Uploaded
author | xuebing
---|---
date | Fri, 09 Mar 2012 19:37:19 -0500
parents |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/lastz_paired_reads_wrapper.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,847 @@

#!/usr/bin/env python

"""
Runs the Lastz paired read alignment process.
Written for Lastz v. 1.02.00.

# Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu),
# then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu)

This tool takes the following input:
a. A collection of 454 paired end reads ( a fasta file )
b. A linker sequence ( a very small fasta file )
c. A reference genome ( nib, 2bit or fasta )

and uses the following process:
1. Split reads into mates: the input to this step is the read file XXX.fasta, and the output is three
   files: XXX.short.fasta, XXX.long.fasta and XXX.mapping.  The mapping file records the information
   necessary to convert mate coordinates back into the original read, which is needed later in the process.

2. Align short mates to the reference: this runs lastz against every chromosome.  The input is XXX.short.fasta
   and the reference genome, and the output is a SAM file, XXX.short.sam.

3. Align long mates to the reference: this runs lastz against every chromosome.  The input is XXX.long.fasta
   and the reference genome, and the output is a SAM file, XXX.long.sam.

4. Combine, and convert mate coordinates back to read coordinates.  The input is XXX.mapping, XXX.short.sam
   and XXX.long.sam, and the output is XXX.sam.

usage: lastz_paired_reads_wrapper.py [options]
    --ref_name: The reference name to change all output matches to
    --ref_source: The reference is cached or from the history
    --source_select: Use a pre-set or cached reference file
    --input1: The name of the reference file if using history, or the reference base name if using cached
    --input2: The reads file to align
    --input3: The sequencing linker file
    --input4: The base quality score 454 file
    --ref_sequences: The number of sequences in the reference file if using one from history
    --output: The name of the output file
    --lastz_seqs_file_dir: Directory of the local lastz_seqs.loc file
"""
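# Example invocation (illustrative only -- the file names below are hypothetical,
# and in practice Galaxy builds this command line from the tool's XML wrapper):
#
#   lastz_paired_reads_wrapper.py --ref_source=history --input1=hg19.fasta \
#       --input2=reads_454.fasta --input3=linker.fasta --input4=reads_454.qual \
#       --ref_sequences=93 --output=aligned.sam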
import optparse, os, subprocess, shutil, sys, tempfile, time
from string import maketrans

from galaxy import eggs
import pkg_resources
pkg_resources.require( 'bx-python' )
from bx.seq.twobit import *
from bx.seq.fasta import FastaReader
from galaxy.util.bunch import Bunch
from galaxy.util import string_as_bool

# Column indexes for SAM required fields
SAM_QNAME_COLUMN = 0
SAM_FLAG_COLUMN = 1
SAM_RNAME_COLUMN = 2
SAM_POS_COLUMN = 3
SAM_MAPQ_COLUMN = 4
SAM_CIGAR_COLUMN = 5
SAM_MRNM_COLUMN = 6
SAM_MPOS_COLUMN = 7
SAM_ISIZE_COLUMN = 8
SAM_SEQ_COLUMN = 9
SAM_QUAL_COLUMN = 10
SAM_MIN_COLUMNS = 11
# SAM bit-encoded flags
BAM_FPAIRED = 1         # the read is paired in sequencing, no matter whether it is mapped in a pair
BAM_FPROPER_PAIR = 2    # the read is mapped in a proper pair
BAM_FUNMAP = 4          # the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
BAM_FMUNMAP = 8         # the mate is unmapped
BAM_FREVERSE = 16       # the read is mapped to the reverse strand
BAM_FMREVERSE = 32      # the mate is mapped to the reverse strand
BAM_FREAD1 = 64         # this is read1
BAM_FREAD2 = 128        # this is read2
BAM_FSECONDARY = 256    # not primary alignment
BAM_FQCFAIL = 512       # QC failure
BAM_FDUP = 1024         # optical or PCR duplicate

# Keep track of all created temporary files so they can be deleted
global tmp_file_names
tmp_file_names = []
# The values in the skipped_lines dict are lists ( they are updated in place,
# so they cannot be tuples ) consisting of:
# - the number of skipped lines for that error
# If not a sequence error:
# - the 1st line number on which the error was found
# - the text of the 1st line on which the error was found
# If a sequence error:
# - the number of the sequence in the file
# - the sequence name on which the error occurred
# We may need to improve dealing with file position and text, as
# much of it comes from temporary files that are created from the
# inputs, and not the inputs themselves, so this could be confusing
# to the user.
global skipped_lines
skipped_lines = dict( bad_interval=[ 0, 0, '' ],
                      inconsistent_read_lengths=[ 0, 0, '' ],
                      inconsistent_reads=[ 0, 0, '' ],
                      inconsistent_sizes=[ 0, 0, '' ],
                      missing_mate=[ 0, 0, '' ],
                      missing_quals=[ 0, 0, '' ],
                      missing_seq=[ 0, 0, '' ],
                      multiple_seqs=[ 0, 0, '' ],
                      no_header=[ 0, 0, '' ],
                      num_fields=[ 0, 0, '' ],
                      reads_paired=[ 0, 0, '' ],
                      sam_flag=[ 0, 0, '' ],
                      sam_headers=[ 0, 0, '' ],
                      sam_min_columns=[ 0, 0, '' ],
                      two_mate_names=[ 0, 0, '' ],
                      wrong_seq_len=[ 0, 0, '' ] )
global total_skipped_lines
total_skipped_lines = 0

def stop_err( msg ):
    sys.stderr.write( "%s" % msg )
    sys.exit()

def skip_line( error_key, position, text ):
    global total_skipped_lines
    # Record the position and text only for the first occurrence of each error
    if not skipped_lines[ error_key ][2]:
        skipped_lines[ error_key ][1] = position
        skipped_lines[ error_key ][2] = text
    skipped_lines[ error_key ][0] += 1
    total_skipped_lines += 1

def get_tmp_file_name( dir=None, suffix=None ):
    """
    Return a unique temporary file name that can be managed.  The
    file must be manually removed after use.
    """
    if dir and suffix:
        tmp_fd, tmp_name = tempfile.mkstemp( dir=dir, suffix=suffix )
    elif dir:
        tmp_fd, tmp_name = tempfile.mkstemp( dir=dir )
    elif suffix:
        tmp_fd, tmp_name = tempfile.mkstemp( suffix=suffix )
    else:
        tmp_fd, tmp_name = tempfile.mkstemp()
    os.close( tmp_fd )
    tmp_file_names.append( tmp_name )
    return tmp_name

def run_command( command ):
    proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
    stderr = proc.stderr.read()
    proc.wait()
    if stderr:
        stop_err( stderr )

def split_paired_reads( input2, combined_linker_file_name ):
    """
    Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals
    showing where the linker is on each read ( combined_linker_file_name ), split the reads
    into left and right halves.

    The input intervals look like this.  Note that they may include multiple intervals for
    the same read ( which should overlap ), and we use the union of them as the linker
    interval.  Non-overlaps are reported to the user, and those reads are not processed.
    Starts are origin zero.

        #name          strand start len size
        FG3OYDA05FTEES +      219   42  283
        FG3OYDA05FVOLL +      263   41  416
        FG3OYDA05FFL7J +      81    42  421
        FG3OYDA05FOQWE +      55    42  332
        FG3OYDA05FV4DW +      297   42  388
        FG3OYDA05FWAQV +      325   42  419
        FG3OYDA05FVLGA +      90    42  367
        FG3OYDA05FWJ71 +      58    42  276

    The output gives each half-sequence on a separate line, like this.  This allows easy
    sorting of the sequences by length, after the fact.

        219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC...
        22  FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG
    """
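    # Worked example of the interval-union rule above ( values are illustrative ):
    # if the linker file reports intervals ( 100, 140 ) and ( 120, 165 ) for the
    # same read of size 300, the linker interval used is their union ( 100, 165 ),
    # so the left mate is text[:100] and the right mate is text[165:].  Intervals
    # such as ( 100, 140 ) and ( 200, 240 ) do not overlap, so that read is
    # reported to the user and skipped.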
    # Bob says these should be hard-coded.
    seq_len_lower_threshold = 17
    short_mate_cutoff = 50
    # We need to pass the name of this file back to the caller.
    tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
    mates_file = file( tmp_mates_file_name, 'w+b' )
    # Read the linker intervals
    combined_linker_file = file( combined_linker_file_name, 'rb' )
    read_to_linker_dict = {}
    i = 0
    for i, line in enumerate( combined_linker_file ):
        line = line.strip()
        if line.startswith( "#" ):
            continue
        if line.find( '#' ) >= 0:
            line = line.split( "#", 1 )[0].rstrip()
        fields = line.split()
        if len( fields ) != 4:
            skip_line( 'num_fields', i+1, line )
            continue
        name, start, length, size = fields
        start = int( start )
        length = int( length )
        size = int( size )
        end = start + length
        if end > size:
            skip_line( 'bad_interval', i+1, line )
            continue
        if name not in read_to_linker_dict:
            read_to_linker_dict[ name ] = ( start, end, size )
            continue
        if read_to_linker_dict[ name ] == None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see below
            continue
        ( s, e, sz ) = read_to_linker_dict[ name ]
        if sz != size:
            skip_line( 'inconsistent_sizes', i+1, name )
            continue
        if s > end or e < start:
            # Non-overlapping intervals, so skip this sequence
            read_to_linker_dict[ name ] = None
            continue
        read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size )
    combined_linker_file.close()
    # We need to pass the name of this file back to the caller.
    tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' )
    mates_mapping_file = file( tmp_mates_mapping_file_name, 'w+b' )
    # Process the sequences
    seqs = 0
    fasta_reader = FastaReader( file( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        if seq.name not in read_to_linker_dict:
            if seq.length > seq_len_lower_threshold:
                mates_file.write( "%-3d %s %s\n" % ( seq.length, seq.name, seq.text ) )
            read_to_linker_dict[ seq.name ] = ""
            continue
        if read_to_linker_dict[ seq.name ] == "":
            skip_line( 'multiple_seqs', seqs, seq.name )
            continue
        if read_to_linker_dict[ seq.name ] == None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see above
            continue
        ( start, end, size ) = read_to_linker_dict[ seq.name ]
        if seq.length != size:
            skip_line( 'wrong_seq_len', seqs, seq.name )
            continue
        left = seq.text[ :start ]
        right = seq.text[ end: ]
        left_is_small = len( left ) <= seq_len_lower_threshold
        right_is_small = len( right ) <= seq_len_lower_threshold
        if left_is_small and right_is_small:
            continue
        if not left_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) )
        if not right_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) )
        read_to_linker_dict[ seq.name ] = ""
    mates_file.close()
    mates_mapping_file.close()
    # Create temporary files for short and long mates
    tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' )
    tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' )
    tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' )
    tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' )
    i = 0
    for i, line in enumerate( file( tmp_mates_file_name, 'rb' ) ):
        fields = line.split()
        seq_len = int( fields[0] )
        seq_name = fields[1]
        seq_text = fields[2]
        if seq_len <= short_mate_cutoff:
            tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
        else:
            tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
    tmp_mates_short.close()
    tmp_mates_long.close()
    return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name
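# A minimal sketch of the "sort by length, after the fact" idea that the mates-file
# layout above enables ( a hypothetical helper, never called by this tool ): because
# each line starts with a left-justified decimal length, a numeric sort on the first
# field orders the half-sequences by length.  Assumes well-formed, non-blank lines.
def _sort_mates_file_by_length( mates_file_name, sorted_file_name ):
    lines = [ line.split( None, 2 ) for line in file( mates_file_name, 'rb' ) ]
    lines.sort( key=lambda fields: int( fields[0] ) )
    out = file( sorted_file_name, 'w+b' )
    for seq_len, seq_name, seq_text in lines:
        # seq_text retains its trailing newline from the input file
        out.write( "%s %s %s" % ( seq_len, seq_name, seq_text ) )
    out.close()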
def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ):
    tmp_align_file_names = []
    if ref_source == 'history':
        # Reference is a fasta dataset from the history.
        # Create temporary files to contain the output from lastz executions.
        tmp_short_file_name = get_tmp_file_name( suffix='short_out' )
        tmp_align_file_names.append( tmp_short_file_name )
        tmp_long_file_name = get_tmp_file_name( suffix='long_out' )
        tmp_align_file_names.append( tmp_long_file_name )
        seqs = 0
        fasta_reader = FastaReader( open( input1 ) )
        while True:
            # Read the next sequence from the reference dataset.  Note that if the reference contains
            # a small number of chromosomes this loop is ok, but in many cases the genome has a bunch
            # of small straggler scaffolds and contigs, and it is a computational waste to do each one
            # of these in its own run.  There is an I/O down side to running by subsets (even if they
            # are one sequence per subset), compared to splitting the reference into sizes of 250 mb.
            # With the subset action, lastz still has to read and parse the entire file for every run
            # (this is true for fasta, but for .2bit files it can access each sequence directly within
            # the file, so the overhead is minimal).
            """
            :> output_file (this creates the output file, empty)
            while there are more sequences to align
                find the next sequences that add up to 250M, put their names in farf.names
                lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ...
                      >> output_file
            """
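            # A possible Python shape for that batching idea ( illustrative only;
            # this tool instead aligns one reference sequence per lastz run ):
            # accumulate ( name, length ) pairs until they total ~250 MB, write
            # the names to a temporary farf.names file, then run
            #     lastz ref.fa[subset=farf.names][multi][unmask] mates.fa >> output_file
            # once per batch rather than once per sequence.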
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz.
            # We're doing this a bit differently here since we could be generating a huge
            # number of temporary files.
            tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
            tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
            tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in_file.close()
            # Align short mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_long_file_name
            run_command( command )
            # Remove the temporary file that contains the current sequence
            os.remove( tmp_in_file_name )
    else:
        # Reference is a locally cached 2bit file; split lastz calls across the chromosomes in the 2bit file
        tbf = TwoBitFile( open( input1, 'rb' ) )
        for chrom in tbf.keys():
            # Align short mates
            tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_short_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_long_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_long_file_name
            run_command( command )
    return tmp_align_file_names
def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ):
    """
    Given a SAM file corresponding to alignments of *subsegments* of paired 'reads' to a reference
    sequence, convert the positions on the subsegments to positions on the reads.  Also (optionally)
    add quality values.

    The input file is in SAM format, as shown below.  Each line represents the alignment of a part
    of a read to a reference sequence.  Read pairs are indicated by suffixes in their names.
    Normally, the suffixes _L and _R indicate the left and right mates of reads ( the suffixes are
    hard-coded below ).  Reads that were not mates have no suffix.

        (SAM header lines omitted)
        F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT *
        F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat *
        F2YP0BU02HREML_R 0 chr21 16386077 255 33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc *
        F2YP0BU02IOF1F_L 0 chr21 17567321 255 7S28M * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC *
        F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt *
        F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA *
        F2YP0BU02JIMJ4_R 0 chr21 22383051 255 19M * 0 0 CCCTTTATCATTTTTTATT *
        F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT *
        F2YP0BU02IODR5_L 0 chr21 30935325 255 37M * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA *
        F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG *
        F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M * 0 0 CACACCTGTAACCCCAGCACTTT *
        F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT *
        F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT *
        F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat *
        F2YP0BU02JDCF1_L 0 chr21 31816600 255 38M * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT *
        F2YP0BU02GZ1GO_R 0 chr21 33360122 255 6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC *
        F2YP0BU02FX387_L 16 chr22 14786201 255 26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT *
        F2YP0BU02IF2NE_R 0 chr22 16960842 255 40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc *
        F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA *
        F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG *
        F2YP0BU02FS4EM_R 0 chr22 30159364 255 29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG *
        F2YP0BU02G197P_L 0 chr22 32044496 255 40M10S * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc *
        F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG *
        F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA *

    The user must provide a mapping file (which might better be called an unmapping file).  This
    file is usually created by split_paired_reads, and tells us how to map the subsegments back to
    original coordinates in a single read (this means the left and right mates were part of a
    single read).  The mapping file contains four columns.  The first two give the mate's name
    (including the suffix) and the read name.  The last two columns describe how much of the full
    original sequence is missing from the mate.  For example, in the read below, the left mate is
    missing 71 on the right (42 for the linker and 29 for the right half).  The right mate is
    missing 339 on the left.
        left half:  TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA
                    AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC
                    AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA
                    CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC
                    ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG
                    ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG
        linker:     CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA
        right half: CTCCTGCCTCAGCCTCCCGAGTAGTTGGG

        mate_name        read_name      offset_to_start offset_from_end
        F2YP0BU02FS4EM_L F2YP0BU02FS4EM 0               71
        F2YP0BU02FS4EM_R F2YP0BU02FS4EM 339             0

    The user can also specify a quality scores file, which should look something like this.
    Quality values are presumed to be PHRED scores, written in space-delimited decimal.

        >F2YP0BU02FS4EM
        38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33
        32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35
        31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34
        34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33
        36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36
        39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40
        40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26
        26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39
        40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40
        39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35
        >F2YP0BU02G197P
        40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40
        40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40
        40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40
        40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40
        40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40
        40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40
        40 40 40 39 39 39 40 40
        >F2YP0BU02FIING
        32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19
        19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25

    The output file is also SAM:

        (SAM header lines omitted)
        F2YP0BU02G7LK5 81 chr21 15557360 255 40M303H * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII
        F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@
        F2YP0BU02HREML 65 chr21 16386077 255 320H33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH
        F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC 4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED
        F2YP0BU02IKX84 81 chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt ;;;=7@.55------?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII
        F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII
        F2YP0BU02JIMJ4 65 chr21 22383051 255 208H19M * 0 0 CCCTTTATCATTTTTTATT 555544E?GE113344I22
        F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII
        F2YP0BU02IODR5 129 chr21 30935325 255 37M154H * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA 6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@
        F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG BB1552222<<>9==8;;?AA=??A???A
        F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M * 0 0 CACACCTGTAACCCCAGCACTTT IIIIIIIIIIICCCCIIIIIHHH
        F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
        F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
        F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF
        F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII
        F2YP0BU02GZ1GO 65 chr21 33360122 255 76H6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51
        F2YP0BU02FX387 145 chr22 14786201 255 201H26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT IIHHHHHHHHHHHHHFFFFFFFFFFF
        F2YP0BU02IF2NE 65 chr22 16960842 255 209H40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7-
        F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5
        F2YP0BU02HKC61 81 chr22 29516998 255 8S32M300H * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG 2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA
        F2YP0BU02FS4EM 65 chr22 30159364 255 339H29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG IIIIHHEIIIIHHHH??=DDHIIIIIDDD
        F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII
        F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG :::9:32267=:114244/...446==<<<?@?:9::::AAA
        F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII
    """
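    # Worked example, using reads from the docstring above: the right mate
    # F2YP0BU02FS4EM_R ( offsets 339 and 0 ) aligns on the plus strand with
    # CIGAR 29M, so 339H is prepended, giving 339H29M in the output.  For a
    # left mate aligned on the reverse strand, such as F2YP0BU02HXV58_L, the
    # missing tail is prepended instead ( 40M6S becomes 226H40M6S ).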
    left_suffix = "_L"
    right_suffix = "_R"
    # Read the mapping
    mate_to_read_dict = {}
    i = 0
    for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ):
        line = line.strip()
        if not line.startswith( "#" ):
            fields = line.split()
            if len( fields ) != 4:
                skip_line( "num_fields", i+1, line )
                continue
            mate_name, read_name, s_offset, e_offset = fields
            if mate_name in mate_to_read_dict:
                skip_line( 'two_mate_names', i+1, mate_name )
                continue
            mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) )
    # Read sequence data
    read_to_nucs_dict = {}
    seqs = 0
    fasta_reader = FastaReader( file( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        seq_text_upper = seq.text.upper()
        if seq.name in read_to_nucs_dict:
            if seq_text_upper != read_to_nucs_dict[ seq.name ]:
                skip_line( 'inconsistent_reads', seqs, seq.name )
            continue
        read_to_nucs_dict[ seq.name ] = seq_text_upper
    # Read quality data
    def quality_sequences( f ):
        seq_name = None
        seq_quals = None
        line_number = 0
        for line in f:
            line_number += 1
            line = line.strip()
            if line.startswith( ">" ):
                if seq_name != None:
                    yield ( seq_name, seq_quals, seq_line )
                seq_name = sequence_name( line )
                seq_line = line_number
                seq_quals = []
            elif seq_name is None:
                skip_line( 'no_header', line_number, line )
                continue
            else:
                seq_quals += [ int( q ) for q in line.split() ]
        if seq_name is not None:
            yield ( seq_name, seq_quals, seq_line )
    def sequence_name( s ):
        s = s[ 1: ].strip()
        if not s:
            return ""
        else:
            return s.split()[ 0 ]
    read_to_quals_dict = {}
    # TODO: should we use Dan's fastaNamedReader here?
    for seq_name, quals, line_number in quality_sequences( file( input4 ) ):
        quals = samify_phred_scores( quals )
        if seq_name in read_to_quals_dict:
            if quals != read_to_quals_dict[ seq_name ]:
                skip_line( 'inconsistent_reads', line_number, seq_name )
            continue
        if len( quals ) != len( read_to_nucs_dict[ seq_name ] ):
            skip_line( 'inconsistent_read_lengths', line_number, seq_name )
            continue
        read_to_quals_dict[ seq_name ] = quals
    # Process the SAM file
    tmp_align_file_names = ' '.join( tmp_align_file_name_list )
    combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' )
    command = 'cat %s | grep -v "^@" | sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name )
    run_command( command )
    fout = file( output, 'w+b' )
    has_non_header = False
    i = 0
    for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ):
        line = line.strip()
        if line.startswith( "@" ):
            if has_non_header:
                skip_line( 'sam_headers', i+1, line )
                continue
            fout.write( "%s\n" % line )
            continue
        has_non_header = True
        fields = line.split()
        num_fields = len( fields )
        if num_fields < SAM_MIN_COLUMNS:
            skip_line( 'sam_min_columns', i+1, line )
            continue
        # Set flags for mates
        try:
            flag = int( fields[ SAM_FLAG_COLUMN ] )
        except ValueError:
            skip_line( 'sam_flag', i+1, line )
            continue
        if flag & ( BAM_FPAIRED | BAM_FREAD1 | BAM_FREAD2 ):
            skip_line( 'reads_paired', i+1, line )
            continue
        mate_name = fields[ SAM_QNAME_COLUMN ]
        unmap_it = False
        half = None
        if mate_name.endswith( left_suffix ):
            flag += BAM_FPAIRED + BAM_FREAD2
            fields[ SAM_FLAG_COLUMN ] = "%d" % flag
            unmap_it = True
            half = "L"
        elif mate_name.endswith( right_suffix ):
            flag += BAM_FPAIRED + BAM_FREAD1
            fields[ SAM_FLAG_COLUMN ] = "%d" % flag
            unmap_it = True
            half = "R"
        on_plus_strand = ( flag & BAM_FREVERSE == 0 )
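        # Example of the flag arithmetic above, using values from the docstring:
        # input mate F2YP0BU02HXV58_L has flag 16 ( BAM_FREVERSE ); as a left
        # mate it gains BAM_FPAIRED ( 1 ) and BAM_FREAD2 ( 128 ), giving 145 in
        # the output.  A right mate with flag 16 becomes 16 + 1 + 64 = 81.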
        # Convert position from mate to read by adding clipping to the cigar
        if not unmap_it:
            read_name = mate_name
        else:
            try:
                read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ]
            except KeyError:
                skip_line( 'missing_mate', i+1, mate_name )
                continue
            cigar = fields[ SAM_CIGAR_COLUMN ]
            cigar_prefix = None
            cigar_suffix = None
            if half == "L":
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "S" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "H" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "H" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "S" )
            elif half == "R":
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "H" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "S" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "S" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "H" )
            else:
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "S" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "S" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "S" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "S" )
            if cigar_prefix != None:
                count, op = cigar_prefix
                cigar = prefix_cigar( "%d%s" % ( count, op ), cigar )
                if op == "S":
                    refPos = int( fields[ SAM_POS_COLUMN ] ) - count
                    fields[ SAM_POS_COLUMN ] = "%d" % refPos
            if cigar_suffix != None:
                count, op = cigar_suffix
                cigar = suffix_cigar( cigar, "%d%s" % ( count, op ) )
            fields[ SAM_QNAME_COLUMN ] = read_name
            fields[ SAM_CIGAR_COLUMN ] = cigar
        # Fetch sequence and quality values, and flip/clip them
        if read_name not in read_to_nucs_dict:
            skip_line( 'missing_seq', i+1, read_name )
            continue
        nucs = read_to_nucs_dict[ read_name ]
        if not on_plus_strand:
            nucs = reverse_complement( nucs )
        quals = None
        if read_to_quals_dict != None:
            if read_name not in read_to_quals_dict:
                skip_line( 'missing_quals', i+1, read_name )
                continue
            quals = read_to_quals_dict[ read_name ]
            if not on_plus_strand:
                quals = reverse_string( quals )
        cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] )
        nucs, quals = clip_for_cigar( cigar, nucs, quals )
        fields[ SAM_SEQ_COLUMN ] = nucs
        if quals != None:
            fields[ SAM_QUAL_COLUMN ] = quals
        # Output the line
        fout.write( "%s\n" % "\t".join( fields ) )
    fout.close()

def prefix_cigar( prefix, cigar ):
    ix = 0
    while cigar[ ix ].isdigit():
        ix += 1
    if cigar[ ix ] != prefix[ -1 ]:
        return prefix + cigar
    count = int( prefix[ :-1 ] ) + int( cigar[ :ix ] )
    return "%d%s%s" % ( count, prefix[ -1 ], cigar[ ix+1: ] )

def suffix_cigar( cigar, suffix ):
    if cigar[ -1 ] != suffix[ -1 ]:
        return cigar + suffix
    ix = len( cigar ) - 2
    while cigar[ ix ].isdigit():
        ix -= 1
    ix += 1
    count = int( cigar[ ix:-1 ] ) + int( suffix[ :-1 ] )
    return "%s%d%s" % ( cigar[ :ix ], count, suffix[ -1 ] )

def split_cigar( text ):
    fields = []
    field = []
    for ch in text:
        if ch not in "MIDHS":
            field += ch
            continue
        if field == []:
            raise ValueError
        fields += [ ( int( "".join( field ) ), ch ) ]
        field = []
    if field != []:
        raise ValueError
    return fields

def clip_for_cigar( cigar, nucs, quals ):
    # Hard clip prefix
    count, op = cigar[ 0 ]
    if op == "H":
        nucs = nucs[ count: ]
        if quals != None:
            quals = quals[ count: ]
        count, op = cigar[ 1 ]
    # Soft clip prefix
    if op == "S":
        nucs = nucs[ :count ].lower() + nucs[ count: ]
    # Hard clip suffix
    count, op = cigar[ -1 ]
    if op == "H":
        nucs = nucs[ :-count ]
        if quals != None:
            quals = quals[ :-count ]
        count, op = cigar[ -2 ]
    # Soft clip suffix
    if op == "S":
        nucs = nucs[ :-count ] + nucs[ -count: ].lower()
    return nucs, quals

def samify_phred_scores( quals ):
    """
    Convert a decimal list of phred base-quality scores to a sam quality string.
    Note that if a quality is outside the dynamic range of sam's ability to
    represent it, we clip the value to the max allowed.  SAM quality characters
    range from chr(33) to chr(126).
    """
    if min( quals ) >= 0 and max( quals ) <= 126-33:
        return "".join( [ chr( 33 + q ) for q in quals ] )
    else:
        return "".join( [ chr( max( 33, min( 126, 33+q ) ) ) for q in quals ] )
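# A small self-check of the CIGAR and quality helpers above ( illustrative only;
# nothing in this tool calls it, and the values mirror the docstring examples ).
def _demo_helpers():
    # split_cigar turns a CIGAR string into ( count, op ) pairs
    assert split_cigar( "22M1D18M9S" ) == [ ( 22, "M" ), ( 1, "D" ), ( 18, "M" ), ( 9, "S" ) ]
    # differing ops are concatenated, matching ops are merged
    assert prefix_cigar( "339H", "29M" ) == "339H29M"
    assert suffix_cigar( "40M", "10S" ) == "40M10S"
    assert suffix_cigar( "40M2S", "8S" ) == "40M10S"
    # phred 40, 38, 33 map to chr(73), chr(71), chr(66)
    assert samify_phred_scores( [ 40, 38, 33 ] ) == "IGB"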
+ """ + if min( quals ) >= 0 and max( quals ) <= 126-33: + return "".join( [ chr( 33 + q ) for q in quals ] ) + else: + return "".join( [ chr( max( 33, min( 126, 33+q ) ) ) for q in quals ] ) + +def reverse_complement( nucs ): + complementMap = maketrans( "ACGTacgt", "TGCAtgca" ) + return nucs[ ::-1 ].translate( complementMap ) + +def reverse_string( s ): + return s[ ::-1 ] + +def __main__(): + # Parse command line + # input1: a reference genome ( 2bit or fasta ) + # input2: a collection of 454 paired end reads ( a fasta file ) + # input3: a linker sequence ( a very small fasta file ) + # input4: a base quality score 454 file ( qual454 ) + parser = optparse.OptionParser() + parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' ) + parser.add_option( '', '--ref_source', dest='ref_source', help='The reference is cached or from the history' ) + parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' ) + parser.add_option( '', '--source_select', dest='source_select', help='Use pre-set or cached reference file' ) + parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' ) + parser.add_option( '', '--input2', dest='input2', help='The 454 reads file to align' ) + parser.add_option( '', '--input3', dest='input3', help='The sequencing linker file' ) + parser.add_option( '', '--input4', dest='input4', help='The base quality score 454 file' ) + parser.add_option( '', '--output', dest='output', help='The output file' ) + parser.add_option( '', '--lastz_seqs_file_dir', dest='lastz_seqs_file_dir', help='Directory of local lastz_seqs.loc file' ) + ( options, args ) = parser.parse_args() + + # output version # of tool + try: + tmp = tempfile.NamedTemporaryFile().name + tmp_stdout = open( tmp, 'wb' ) + proc = subprocess.Popen( args='lastz -v', shell=True, stdout=tmp_stdout ) + tmp_stdout.close() + returncode = proc.wait() + stdout = None + for line in open( tmp_stdout.name, 'rb' ): + if line.lower().find( 'version' ) >= 0: + stdout = line.strip() + break + if stdout: + sys.stdout.write( '%s\n' % stdout ) + else: + raise Exception + except: + sys.stdout.write( 'Could not determine Lastz version\n' ) + + if options.ref_name: + ref_name = '[nickname=%s]' % options.ref_name + else: + ref_name = '' + if options.ref_source == 'history': + # Reference is a fasta dataset from the history + try: + # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ). + error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes." 
            ref_sequences = int( options.ref_sequences )
            if ref_sequences < 1:
                stop_err( error_msg )
        except:
            stop_err( error_msg )
    else:
        ref_sequences = 0
    tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' )
    tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' )
    # Run lastz twice ( with different options ) on the linker sequence and the paired end reads,
    # looking for the linker ( each run finds some alignments the other doesn't )
    command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
        ( options.input3, options.input2, tmp_w12_name )
    run_command( command )
    command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
        ( options.input3, options.input2, tmp_T1_name )
    run_command( command )
    # Combine the alignment output from the two lastz runs
    tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' )
    command = 'cat %s %s | sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name )
    run_command( command )
    # Use the alignment info to split reads into left and right mates
    tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name )
    # Align mates to the reference - tmp_align_file_name_list is a list of file names created by align_mates()
    tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name )
    # Combine and convert mate coordinates back to read coordinates
    paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output )
    # Delete all temporary files
    for file_name in tmp_file_names:
        os.remove( file_name )
    # Handle any invalid lines in the input data
    if total_skipped_lines:
        msgs = dict( bad_interval="Bad interval in line",
                     inconsistent_read_lengths="Inconsistent read/quality lengths for seq #",
                     inconsistent_reads="Inconsistent reads for seq #",
                     inconsistent_sizes="Inconsistent sizes for seq #",
                     missing_mate="Mapping file does not include mate on line",
                     missing_quals="Missing quality values for name on line",
                     missing_seq="Missing sequence for name on line",
                     multiple_seqs="Multiple names for seq #",
                     no_header="First quality sequence has no header",
                     num_fields="Must have 4 fields in line",
                     reads_paired="SAM flag indicates reads already paired on line",
                     sam_flag="Bad SAM flag on line",
                     sam_headers="SAM headers on line",
                     sam_min_columns="Need 11 columns on line",
                     two_mate_names="Mate name already seen, line",
                     wrong_seq_len="Size differs from length of seq #" )
        print "Skipped %d invalid lines: " % total_skipped_lines
        msg = ""
        for k, v in skipped_lines.items():
            if v[0]:
                # v[0] is the number of times the error occurred
                # v[1] is the position of the line or sequence in the file
                # v[2] is the name of the sequence or the text of the line
                msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] )
        print msg

if __name__=="__main__": __main__()