Mercurial > repos > xuebing > sharplabtool
view tools/sr_mapping/lastz_paired_reads_wrapper.py @ 1:cdcb0ce84a1b
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:45:15 -0500 |
parents | 9071e359b9a3 |
children |
line wrap: on
line source
#!/usr/bin/env python """ Runs Lastz paired read alignment process Written for Lastz v. 1.02.00. # Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu), # then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu) This tool takes the following input: a. A collection of 454 paired end reads ( a fasta file ) b. A linker sequence ( a very small fasta file ) c. A reference genome ( nob, 2bit or fasta ) and uses the following process: 1. Split reads into mates: the input to this step is the read file XXX.fasta, and the output is three files; XXX.short.fasta, XXX.long.fasta and XXX.mapping. The mapping file records the information necessary to convert mate coordinates back into the original read, which is needed later in the process. 2. Align short mates to the reference: this runs lastz against every chromosome. The input is XXX.short.fasta and the reference genome, and the output is a SAM file, XXX.short.sam. 3. Align long mates to the reference: this runs lastz against every chromosome. The input is XXX.long.fasta and the reference genome, and the output is a SAM file, XXX.long.sam. 4. Combine, and convert mate coordinates back to read coordinates. The input is XXX.mapping, XXX.short.sam and XXX.long.sam, and the output is XXX.sam. usage: lastz_paired_reads_wrapper.py [options] --ref_name: The reference name to change all output matches to --ref_source: The reference is cached or from the history --source_select: Use pre-set or cached reference file --input1: The name of the reference file if using history or reference base name if using cached --input2: The reads file to align --input3: The sequencing linker file --input4: The base quality score 454 file --ref_sequences: The number of sequences in the reference file if using one from history --output: The name of the output file --lastz_seqs_file_dir: Directory of local lastz_seqs.loc file """ import optparse, os, subprocess, shutil, sys, tempfile, time from string import maketrans from galaxy import eggs import pkg_resources pkg_resources.require( 'bx-python' ) from bx.seq.twobit import * from bx.seq.fasta import FastaReader from galaxy.util.bunch import Bunch from galaxy.util import string_as_bool # Column indexes for SAM required fields SAM_QNAME_COLUMN = 0 SAM_FLAG_COLUMN = 1 SAM_RNAME_COLUMN = 2 SAM_POS_COLUMN = 3 SAM_MAPQ_COLUMN = 4 SAM_CIGAR_COLUMN = 5 SAM_MRNM_COLUMN = 6 SAM_MPOS_COLUMN = 7 SAM_ISIZE_COLUMN = 8 SAM_SEQ_COLUMN = 9 SAM_QUAL_COLUMN = 10 SAM_MIN_COLUMNS = 11 # SAM bit-encoded flags BAM_FPAIRED = 1 # the read is paired in sequencing, no matter whether it is mapped in a pair BAM_FPROPER_PAIR = 2 # the read is mapped in a proper pair BAM_FUNMAP = 4 # the read itself is unmapped; conflictive with BAM_FPROPER_PAIR BAM_FMUNMAP = 8 # the mate is unmapped BAM_FREVERSE = 16 # the read is mapped to the reverse strand BAM_FMREVERSE = 32 # the mate is mapped to the reverse strand BAM_FREAD1 = 64 # this is read1 BAM_FREAD2 = 128 # this is read2 BAM_FSECONDARY = 256 # not primary alignment BAM_FQCFAIL = 512 # QC failure BAM_FDUP = 1024 # optical or PCR duplicate # Keep track of all created temporary files so they can be deleted global tmp_file_names tmp_file_names = [] # The values in the skipped_lines dict are tuples consisting of: # - the number of skipped lines for that error # If not a sequence error: # - the 1st line number on which the error was found # - the text of the 1st line on which the error was found # If a sequence error: # - The number of the sequence in the file # - the sequence name on which the error occurred # We may need to improve dealing with file position and text as # much of it comes from temporary files that are created from the # inputs, and not the inputs themselves, so this could be confusing # to the user. global skipped_lines skipped_lines = dict( bad_interval=( 0, 0, '' ), inconsistent_read_lengths=( 0, 0, '' ), inconsistent_reads=( 0, 0, '' ), inconsistent_sizes=( 0, 0, '' ), missing_mate=( 0, 0, '' ), missing_quals=( 0, 0, '' ), missing_seq=( 0, 0, '' ), multiple_seqs=( 0, 0, '' ), no_header=( 0, 0, '' ), num_fields=( 0, 0, '' ), reads_paired=( 0, 0, '' ), sam_flag=( 0, 0, '' ), sam_headers=( 0, 0, '' ), sam_min_columns=( 0, 0, '' ), two_mate_names=( 0, 0, '' ), wrong_seq_len=( 0, 0, '' ) ) global total_skipped_lines total_skipped_lines = 0 def stop_err( msg ): sys.stderr.write( "%s" % msg ) sys.exit() def skip_line( error_key, position, text ): if not skipped_lines[ error_key ][2]: skipped_lines[ error_key ][1] = position skipped_lines[ error_key ][2] = text skipped_lines[ error_key ][0] += 1 total_skipped_lines += 1 def get_tmp_file_name( dir=None, suffix=None ): """ Return a unique temporary file name that can be managed. The file must be manually removed after use. """ if dir and suffix: tmp_fd, tmp_name = tempfile.mkstemp( dir=dir, suffix=suffix ) elif dir: tmp_fd, tmp_name = tempfile.mkstemp( dir=dir ) elif suffix: tmp_fd, tmp_name = tempfile.mkstemp( suffix=suffix ) os.close( tmp_fd ) tmp_file_names.append( tmp_name ) return tmp_name def run_command( command ): proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE, ) proc.wait() stderr = proc.stderr.read() proc.wait() if stderr: stop_err( stderr ) def split_paired_reads( input2, combined_linker_file_name ): """ Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals showing where the linker is on each read ( combined_linker_file_name ), split the reads into left and right halves. The input intervals look like this. Note that they may include multiple intervals for the same read ( which should overlap ), and we use the union of them as the linker interval. Non-overlaps are reported to the user, and those reads are not processed. Starts are origin zero. #name strand start len size FG3OYDA05FTEES + 219 42 283 FG3OYDA05FVOLL + 263 41 416 FG3OYDA05FFL7J + 81 42 421 FG3OYDA05FOQWE + 55 42 332 FG3OYDA05FV4DW + 297 42 388 FG3OYDA05FWAQV + 325 42 419 FG3OYDA05FVLGA + 90 42 367 FG3OYDA05FWJ71 + 58 42 276 The output gives each half-sequence on a separate line, like this. This allows easy sorting of the sequences by length, after the fact. 219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC... 22 FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG """ # Bob says these should be hard-coded. seq_len_lower_threshold = 17 short_mate_cutoff = 50 # We need to pass the name of this file back to the caller. tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' ) mates_file = file( tmp_mates_file_name, "w+b" ) # Read the linker intervals combined_linker_file = file( combined_linker_file_name, "rb" ) read_to_linker_dict = {} i = 0 for i, line in enumerate( combined_linker_file ): line = line.strip() if line.startswith( "#" ): continue if line.find( '#' ) >= 0: line = line.split( "#", 1 )[0].rstrip() fields = line.split() if len( fields ) != 4: skip_line( 'num_fields', i+1, line ) continue name, start, length, size = fields start = int( start ) length = int( length ) size = int( size ) end = start + length if end > size: skip_line[ 'bad_interval' ] += 1 continue if name not in read_to_linker_dict: read_to_linker_dict[ name ] = ( start, end, size ) continue if read_to_linker_dict[ name ] == None: # Read previously marked as non-overlapping intervals, so skip this sequence - see below continue ( s, e, sz ) = read_to_linker_dict[ name ] if sz != size: skip_line( 'inconsistent_sizes', i+1, name ) continue if s > end or e < start: # Non-overlapping intervals, so skip this sequence read_to_linker_dict[ name ] = None continue read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size ) combined_linker_file.close() # We need to pass the name of this file back to the caller. tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' ) mates_mapping_file = file( tmp_mates_mapping_file_name, 'w+b' ) # Process the sequences seqs = 0 fasta_reader = FastaReader( file( input2, 'rb' ) ) while True: seq = fasta_reader.next() if not seq: break seqs += 1 if seq.name not in read_to_linker_dict: if seq.length > seq_len_lower_threshold: mates_file.write( "%-3d %s %s\n" % ( seq.length, seq.name, seq.text ) ) read_to_linker_dict[ seq.name ] = "" continue if read_to_linker_dict[ seq.name ] == "": skip_line( 'multiple_seqs', seqs, seq.name ) continue if read_to_linker_dict[ seq.name ] == None: # Read previously marked as non-overlapping intervals, so skip this sequence - see above continue ( start, end, size ) = read_to_linker_dict[ seq.name ] if seq.length != size: skip_line( 'wrong_seq_len', seqs, seq.name ) continue left = seq.text[ :start ] right = seq.text[ end: ] left_is_small = len( left ) <= seq_len_lower_threshold right_is_small = len( right ) <= seq_len_lower_threshold if left_is_small and right_is_small: continue if not left_is_small: mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) ) mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) ) if not right_is_small: mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) ) mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) ) read_to_linker_dict[ seq.name ] = "" combined_linker_file.close() mates_file.close() mates_mapping_file.close() # Create temporary files for short and long mates tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' ) tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' ) tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' ) tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' ) i = 0 for i, line in enumerate( file( tmp_mates_file_name, 'rb' ) ): fields = line.split() seq_len = int( fields[0] ) seq_name = fields[1] seq_text = fields[2] if seq_len <= short_mate_cutoff: tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) ) else: tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) ) tmp_mates_short.close() tmp_mates_long.close() return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ): tmp_align_file_names = [] if ref_source == 'history': # Reference is a fasta dataset from the history # Create temporary files to contain the output from lastz executions tmp_short_file_name = get_tmp_file_name( suffix='short_out' ) tmp_align_file_names.append( tmp_short_file_name ) tmp_long_file_name = get_tmp_file_name( suffix='long_out' ) tmp_align_file_names.append( tmp_long_file_name ) seqs = 0 fasta_reader = FastaReader( open( input1 ) ) while True: # Read the next sequence from the reference dataset. Note that if the reference contains # a small number of chromosomes this loop is ok, but in many cases the genome has a bunch # of small straggler scaffolds and contigs and it is a computational waste to do each one # of these in its own run. There is an I/O down side to running by subsets (even if they are # one sequence per subset), compared to splitting the reference into sizes of 250 mb. With # the subset action, lastz still has to read and parse the entire file for every run (this # is true for fasta, but for .2bit files it can access each sequence directly within the file, # so the overhead is minimal). """ :> output_file (this creates the output file, empty) while there are more sequences to align find the next sequences that add up to 250M, put their names in farf.names lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ... >> output_file """ seq = fasta_reader.next() if not seq: break seqs += 1 # Create a temporary file to contain the current sequence as input to lastz. # We're doing this a bit differently here since we could be generating a huge # number of temporary files. tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs ) tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' ) tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) ) tmp_in_file.close() # Align short mates command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name ) command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- ' command += '>> %s' % tmp_short_file_name run_command( command ) # Align long mates command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name ) command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- ' command += '>> %s' % tmp_long_file_name run_command( command ) # Remove the temporary file that contains the current sequence os.remove( tmp_in_file_name ) else: # Reference is a locally cached 2bit file, split lastz calls across number of chroms in 2bit file tbf = TwoBitFile( open( input1, 'rb' ) ) for chrom in tbf.keys(): # Align short mates tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom ) tmp_align_file_names.append( tmp_short_file_name ) command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name ) command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- ' command += '> %s' % tmp_short_file_name run_command( command ) # Align long mates tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom ) tmp_align_file_names.append( tmp_long_file_name ) command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name ) command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- ' command += '> %s' % tmp_long_file_name run_command( command ) return tmp_align_file_names def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ): """ Given a SAM file corresponding to alignments of *subsegments* of paired 'reads' to a reference sequence, convert the positions on the subsegments to positions on the reads. Also (optionally) add quality values. The input file is in SAM format, as shown below. Each line represents the alignment of a part of a read to a reference sequence. Read pairs are indicated by suffixes in their names. Normally, the suffixes _L and _R indicate the left and right mates of reads (this can be overridden with the --left and --right options). Reads that were not mates have no suffix. (SAM header lines omitted) F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT * F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat * F2YP0BU02HREML_R 0 chr21 16386077 255 33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc * F2YP0BU02IOF1F_L 0 chr21 17567321 255 7S28M * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC * F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt * F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA * F2YP0BU02JIMJ4_R 0 chr21 22383051 255 19M * 0 0 CCCTTTATCATTTTTTATT * F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT * F2YP0BU02IODR5_L 0 chr21 30935325 255 37M * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA * F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG * F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M * 0 0 CACACCTGTAACCCCAGCACTTT * F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT * F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT * F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat * F2YP0BU02JDCF1_L 0 chr21 31816600 255 38M * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT * F2YP0BU02GZ1GO_R 0 chr21 33360122 255 6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC * F2YP0BU02FX387_L 16 chr22 14786201 255 26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT * F2YP0BU02IF2NE_R 0 chr22 16960842 255 40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc * F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA * F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG * F2YP0BU02FS4EM_R 0 chr22 30159364 255 29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG * F2YP0BU02G197P_L 0 chr22 32044496 255 40M10S * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc * F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG * F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA * The user must provide a mapping file (which might better be called an unmapping file). This file is usually created by split_paired_reads, and tells us how to map the subsegments back to original coordinates in a single read (this means the left and right mates were part of a single read). The mapping file contains four columns. The first two give the mates's name (including the suffix) and the read name. The last two columns describe how much of the full original sequence is missing from the mate. For example, in the read below, the left mate is missing 63 on the right (42 for the linker and 21 for the right half). The right mate is missing 339 on the left. left half: TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG linker: CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA right half: CTCCTGCCTCAGCCTCCCGAG mate_name read_name offset_to_start offset_from_end F2YP0BU02FS4EM_L F2YP0BU02FS4EM 0 71 F2YP0BU02FS4EM_R F2YP0BU02FS4EM 339 0 The user can also specify a quality scores file, which should look something like this. Quality values are presumed to be PHRED scores, written in space-delimited decimal. >F2YP0BU02FS4EM 38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33 32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35 31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34 34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33 36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36 39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40 40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26 26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40 39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35 >F2YP0BU02G197P 40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40 40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 >F2YP0BU02FIING 32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19 19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25 The output file is also SAM: (SAM header lines omitted) F2YP0BU02G7LK5 81 chr21 15557360 255 40M303H * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@ F2YP0BU02HREML 65 chr21 16386077 255 320H33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC 4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED F2YP0BU02IKX84 81 chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt ;;;=7@.55------?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII F2YP0BU02JIMJ4 65 chr21 22383051 255 208H19M * 0 0 CCCTTTATCATTTTTTATT 555544E?GE113344I22 F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII F2YP0BU02IODR5 129 chr21 30935325 255 37M154H * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA 6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@ F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG BB1552222<<>9==8;;?AA=??A???A F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M * 0 0 CACACCTGTAACCCCAGCACTTT IIIIIIIIIIICCCCIIIIIHHH F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442 F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442 F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII F2YP0BU02GZ1GO 65 chr21 33360122 255 76H6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51 F2YP0BU02FX387 145 chr22 14786201 255 201H26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT IIHHHHHHHHHHHHHFFFFFFFFFFF F2YP0BU02IF2NE 65 chr22 16960842 255 209H40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7- F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5 F2YP0BU02HKC61 81 chr22 29516998 255 8S32M300H * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG 2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA F2YP0BU02FS4EM 65 chr22 30159364 255 339H29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG IIIIHHEIIIIHHHH??=DDHIIIIIDDD F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG :::9:32267=:114244/...446==<<<?@?:9::::AAA F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII """ left_suffix = "_L" right_suffix = "_R" # Read the mapping mate_to_read_dict = {} i = 0 for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ): line = line.strip() if not line.startswith( "#" ): fields = line.split() if len( fields ) != 4: skip_line( "num_fields", i+1, line ) continue mate_name, read_name, s_offset, e_offset = fields if mate_name in mate_to_read_dict: skip_line( 'two_mate_names', i+1, mate_name ) continue mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) ) # Read sequence data read_to_nucs_dict = {} seqs = 0 fasta_reader = FastaReader( file( input2, 'rb' ) ) while True: seq = fasta_reader.next() if not seq: break seqs += 1 seq_text_upper = seq.text.upper() if seq.name in read_to_nucs_dict: if seq_text_upper != read_to_nucs_dict[ seq.name ]: skip_line( 'inconsistent_reads', seqs, seq.name ) continue read_to_nucs_dict[ seq.name ] = seq_text_upper # Read quality data def quality_sequences( f ): seq_name = None seq_quals = None line_number = 0 for line in f: line_number += 1 line = line.strip() if line.startswith( ">" ): if seq_name != None: yield ( seq_name, seq_quals, seq_line ) seq_name = sequence_name( line ) seq_line = line_number seq_quals = [] elif seq_name is None: skip_line( 'no_header', line_number, line ) continue else: seq_quals += [ int( q ) for q in line.split() ] if seq_name is not None: yield ( seq_name, seq_quals, seq_line ) def sequence_name( s ): s = s[ 1: ].strip() if not s: return "" else: return s.split()[ 0 ] read_to_quals_dict = {} # TODO: should we use Dan's fastaNamedReader here? for seq_name, quals, line_number in quality_sequences( file( input4 ) ): quals = samify_phred_scores( quals ) if seq_name in read_to_quals_dict: if quals != read_to_quals_dict[ seq_name ]: skip_line( 'inconsistent_reads', line_number, seq_name ) continue if len( quals ) != len( read_to_nucs_dict[ seq_name ] ): skip_line( 'inconsistent_read_lengths', line_number, seq_name ) continue read_to_quals_dict[ seq_name ] = quals # process the SAM file tmp_align_file_names = ' '.join( tmp_align_file_name_list ) combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' ) command = 'cat %s | grep -v "^@" | sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name ) run_command( command ) fout = file( output, 'w+b' ) has_non_header = False i = 0 for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ): line = line.strip() if line.startswith( "@" ): if has_non_header: skip_line( 'sam_headers', i+1, line ) continue fout.write( "%s\n" % line ) continue has_non_header = True fields = line.split() num_fields = len( fields ) if num_fields < SAM_MIN_COLUMNS: skip_line( 'sam_min_columns', i+1, line ) continue # Set flags for mates try: flag = int( fields[ SAM_FLAG_COLUMN ] ) except ValueError: skip_line( 'sam_flag', i+1, line ) continue if not( flag & ( BAM_FPAIRED + BAM_FREAD1 + BAM_FREAD2 ) == 0 ): skip_line( 'reads_paired', i+1, line ) continue mate_name = fields[ SAM_QNAME_COLUMN ] unmap_it = False half = None if mate_name.endswith( left_suffix ): flag += BAM_FPAIRED + BAM_FREAD2 fields[ SAM_FLAG_COLUMN ] = "%d" % flag unmap_it = True half = "L" elif mate_name.endswith( right_suffix ): flag += BAM_FPAIRED + BAM_FREAD1 fields[ SAM_FLAG_COLUMN ] = "%d" % flag unmap_it = True half = "R" on_plus_strand = ( flag & BAM_FREVERSE == 0 ) # Convert position from mate to read by adding clipping to cigar if not unmap_it: read_name = mate_name else: try: read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ] except KeyError: skip_line( 'missing_mate', i+1, mate_name ) continue cigar = fields[ SAM_CIGAR_COLUMN ] cigar_prefix = None cigar_suffix = None if half == "L": if on_plus_strand: if s_offset > 0: cigar_prefix = ( s_offset, "S" ) if e_offset > 0: cigar_suffix = ( e_offset, "H" ) else: if e_offset > 0: cigar_prefix = ( e_offset, "H" ) if s_offset > 0: cigar_suffix = ( s_offset, "S" ) elif half == "R": if on_plus_strand: if s_offset > 0: cigar_prefix = ( s_offset, "H" ) if e_offset > 0: cigar_suffix = ( e_offset, "S" ) else: if e_offset > 0: cigar_prefix = ( e_offset, "S" ) if s_offset > 0: cigar_suffix = ( s_offset, "H" ) else: if on_plus_strand: if s_offset > 0: cigar_prefix = ( s_offset, "S" ) if e_offset > 0: cigar_suffix = ( e_offset, "S" ) else: if e_offset > 0: cigar_prefix = ( e_offset, "S" ) if s_offset > 0: cigar_suffix = ( s_offset, "S" ) if cigar_prefix != None: count, op = cigar_prefix cigar = prefix_cigar( "%d%s" % ( count, op ), cigar ) if op == "S": refPos = int( fields[ SAM_POS_COLUMN ] ) - count fields[ SAM_POS_COLUMN ] = "%d" % refPos if cigar_suffix != None: count, op = cigar_suffix cigar = suffix_cigar( cigar,"%d%s" % ( count, op) ) fields[ SAM_QNAME_COLUMN ] = read_name fields[ SAM_CIGAR_COLUMN ] = cigar # Fetch sequence and quality values, and flip/clip them if read_name not in read_to_nucs_dict: skip_line( 'missing_seq', i+1, read_name ) continue nucs = read_to_nucs_dict[ read_name ] if not on_plus_strand: nucs = reverse_complement( nucs ) quals = None if read_to_quals_dict != None: if read_name not in read_to_quals_dict: skip_line( 'missing_quals', i+1, read_name ) continue quals = read_to_quals_dict[ read_name ] if not on_plus_strand: quals = reverse_string( quals ) cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] ) nucs, quals = clip_for_cigar( cigar, nucs, quals ) fields[ SAM_SEQ_COLUMN ] = nucs if quals != None: fields[ SAM_QUAL_COLUMN ] = quals # Output the line fout.write( "%s\n" % "\t".join( fields ) ) fout.close() def prefix_cigar( prefix, cigar ): ix = 0 while cigar[ ix ].isdigit(): ix += 1 if cigar[ ix ] != prefix[ -1 ]: return prefix + cigar count = int( prefix[ :-1 ] ) + int( cigar[ :ix ] ) return "%d%s%s" % ( count, prefix[ -1 ], cigar[ ix+1: ] ) def suffix_cigar( cigar, suffix ): if cigar[ -1 ] != suffix[ -1 ]: return cigar + suffix ix = len( cigar ) - 2 while cigar[ix].isdigit(): ix -= 1 ix += 1 count = int( cigar[ ix:-1 ] ) + int( suffix[ :-1 ] ) return "%s%d%s" % ( cigar[ :ix ], count, suffix[ -1 ] ) def split_cigar( text ): fields = [] field = [] for ch in text: if ch not in "MIDHS": field += ch continue if field == []: raise ValueError fields += [ ( int( "".join( field ) ), ch ) ] field = [] if field != []: raise ValueError return fields def clip_for_cigar( cigar, nucs, quals ): # Hard clip prefix count, op = cigar[0] if op == "H": nucs = nucs[ count: ] if quals != None: quals = quals[ count: ] count, op = cigar[ 1 ] # Soft clip prefix if op == "S": nucs = nucs[ :count ].lower() + nucs[ count: ] # Hard clip suffix count,op = cigar[ -1 ] if op == "H": nucs = nucs[ :-count ] if quals != None: quals = quals[ :-count ] count, op = cigar[ -2 ] # Soft clip suffix if op == "S": nucs = nucs[ :-count ] + nucs[ -count: ].lower() return nucs, quals def samify_phred_scores( quals ): """ Convert a decimal list of phred base-quality scores to a sam quality string. Note that if a quality is outside the dynamic range of sam's ability to represent it, we clip the value to the max allowed. SAM quality scores range from chr(33) to chr(126). """ if min( quals ) >= 0 and max( quals ) <= 126-33: return "".join( [ chr( 33 + q ) for q in quals ] ) else: return "".join( [ chr( max( 33, min( 126, 33+q ) ) ) for q in quals ] ) def reverse_complement( nucs ): complementMap = maketrans( "ACGTacgt", "TGCAtgca" ) return nucs[ ::-1 ].translate( complementMap ) def reverse_string( s ): return s[ ::-1 ] def __main__(): # Parse command line # input1: a reference genome ( 2bit or fasta ) # input2: a collection of 454 paired end reads ( a fasta file ) # input3: a linker sequence ( a very small fasta file ) # input4: a base quality score 454 file ( qual454 ) parser = optparse.OptionParser() parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' ) parser.add_option( '', '--ref_source', dest='ref_source', help='The reference is cached or from the history' ) parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' ) parser.add_option( '', '--source_select', dest='source_select', help='Use pre-set or cached reference file' ) parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' ) parser.add_option( '', '--input2', dest='input2', help='The 454 reads file to align' ) parser.add_option( '', '--input3', dest='input3', help='The sequencing linker file' ) parser.add_option( '', '--input4', dest='input4', help='The base quality score 454 file' ) parser.add_option( '', '--output', dest='output', help='The output file' ) parser.add_option( '', '--lastz_seqs_file_dir', dest='lastz_seqs_file_dir', help='Directory of local lastz_seqs.loc file' ) ( options, args ) = parser.parse_args() # output version # of tool try: tmp = tempfile.NamedTemporaryFile().name tmp_stdout = open( tmp, 'wb' ) proc = subprocess.Popen( args='lastz -v', shell=True, stdout=tmp_stdout ) tmp_stdout.close() returncode = proc.wait() stdout = None for line in open( tmp_stdout.name, 'rb' ): if line.lower().find( 'version' ) >= 0: stdout = line.strip() break if stdout: sys.stdout.write( '%s\n' % stdout ) else: raise Exception except: sys.stdout.write( 'Could not determine Lastz version\n' ) if options.ref_name: ref_name = '[nickname=%s]' % options.ref_name else: ref_name = '' if options.ref_source == 'history': # Reference is a fasta dataset from the history try: # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ). error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes." ref_sequences = int( options.ref_sequences ) if ref_sequences < 1: stop_err( error_msg ) except: stop_err( error_msg ) else: ref_sequences = 0 tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' ) tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' ) # Run lastz twice ( with different options ) on the linker sequence and paired end reads, # looking for the linker ( each run finds some the other doesn't ) command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \ ( options.input3, options.input2, tmp_w12_name ) run_command( command ) command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \ ( options.input3, options.input2, tmp_T1_name ) run_command( command ) # Combine the alignment output from the two lastz runs tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' ) command = 'cat %s %s | sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name ) run_command( command ) # Use the alignment info to split reads into left and right mates tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name ) # Align mates to the reference - tmp_align_file_names is a list of file names created by align_mates() tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ) # Combine and convert mate coordinates back to read coordinates paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output ) # Delete all temporary files for file_name in tmp_file_names: os.remove( file_name ) # Handle any invalid lines in the input data if total_skipped_lines: msgs = dict( bad_interval="Bad interval in line", inconsistent_read_lengths="Inconsistent read/quality lengths for seq #", inconsistent_reads="Inconsistent reads for seq #", inconsistent_sizes="Inconsistent sizes for seq #", missing_mate="Mapping file does not include mate on line", missing_quals="Missing quality values for name on line", missing_seq="Missing sequence for name on line", multiple_seqs="Multiple names for seq #", no_header="First quality sequence has no header", num_fields="Must have 4 fields in line", reads_paired="SAM flag indicates reads already paired on line", sam_flag="Bad SAM flag on line", sam_headers="SAM headers on line", sam_min_columns="Need 11 columns on line", two_mate_names="Mate name already seen, line", wrong_seq_len="Size differs from length of seq #" ) print "Skipped %d invalid lines: " msg = "" for k, v in skipped_lines.items(): if v[0]: # v[0] is the number of times the error occurred # v[1] is the position of the line or sequence in the file # v[2] is the name of the sequence or the text of the line msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] ) print msg if __name__=="__main__": __main__()