sharplabtool: tools/ilmn_pacbio/assembly

annotate tools/ilmn_pacbio/assembly_stats.py @ 0:9071e359b9a3

Uploaded

author	xuebing
date	Fri, 09 Mar 2012 19:37:19 -0500
parents
children

rev	line source
0 9071e359b9a3 Uploaded xuebing parents: diff changeset	1 #!/usr/bin/env python
9071e359b9a3 Uploaded xuebing parents: diff changeset	2 #
9071e359b9a3 Uploaded xuebing parents: diff changeset	3 #Copyright (c) 2011, Pacific Biosciences of California, Inc.
9071e359b9a3 Uploaded xuebing parents: diff changeset	4 #
9071e359b9a3 Uploaded xuebing parents: diff changeset	5 #All rights reserved.
9071e359b9a3 Uploaded xuebing parents: diff changeset	6 #
9071e359b9a3 Uploaded xuebing parents: diff changeset	7 #Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
9071e359b9a3 Uploaded xuebing parents: diff changeset	8 # * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9071e359b9a3 Uploaded xuebing parents: diff changeset	9 # * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9071e359b9a3 Uploaded xuebing parents: diff changeset	10 # * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9071e359b9a3 Uploaded xuebing parents: diff changeset	11 #
9071e359b9a3 Uploaded xuebing parents: diff changeset	12 #THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
9071e359b9a3 Uploaded xuebing parents: diff changeset	13 #WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY
9071e359b9a3 Uploaded xuebing parents: diff changeset	14 #DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
9071e359b9a3 Uploaded xuebing parents: diff changeset	15 #LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
9071e359b9a3 Uploaded xuebing parents: diff changeset	16 #(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
9071e359b9a3 Uploaded xuebing parents: diff changeset	17 #
9071e359b9a3 Uploaded xuebing parents: diff changeset	18 import sys, os
9071e359b9a3 Uploaded xuebing parents: diff changeset	19 from optparse import OptionParser
9071e359b9a3 Uploaded xuebing parents: diff changeset	20 from galaxy import eggs
9071e359b9a3 Uploaded xuebing parents: diff changeset	21 import pkg_resources
9071e359b9a3 Uploaded xuebing parents: diff changeset	22 pkg_resources.require( 'bx-python' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	23 from bx.seq.fasta import FastaReader
9071e359b9a3 Uploaded xuebing parents: diff changeset	24
9071e359b9a3 Uploaded xuebing parents: diff changeset	25 def getStats( fastaFile, genomeLength, minContigLength ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	26 lengths = []
9071e359b9a3 Uploaded xuebing parents: diff changeset	27 stats = { "Num" : 0,
9071e359b9a3 Uploaded xuebing parents: diff changeset	28 "Sum" : 0,
9071e359b9a3 Uploaded xuebing parents: diff changeset	29 "Max" : 0,
9071e359b9a3 Uploaded xuebing parents: diff changeset	30 "Avg" : 0,
9071e359b9a3 Uploaded xuebing parents: diff changeset	31 "N50" : 0,
9071e359b9a3 Uploaded xuebing parents: diff changeset	32 "99%" : 0 }
9071e359b9a3 Uploaded xuebing parents: diff changeset	33 fasta_reader = FastaReader( open( fastaFile, 'rb' ) )
9071e359b9a3 Uploaded xuebing parents: diff changeset	34 while True:
9071e359b9a3 Uploaded xuebing parents: diff changeset	35 seq = fasta_reader.next()
9071e359b9a3 Uploaded xuebing parents: diff changeset	36 if not seq:
9071e359b9a3 Uploaded xuebing parents: diff changeset	37 break
9071e359b9a3 Uploaded xuebing parents: diff changeset	38 if seq.length < minContigLength:
9071e359b9a3 Uploaded xuebing parents: diff changeset	39 continue
9071e359b9a3 Uploaded xuebing parents: diff changeset	40 lengths.append( seq.length )
9071e359b9a3 Uploaded xuebing parents: diff changeset	41 if lengths:
9071e359b9a3 Uploaded xuebing parents: diff changeset	42 stats[ 'Num' ] = len( lengths )
9071e359b9a3 Uploaded xuebing parents: diff changeset	43 stats[ 'Sum' ] = sum( lengths )
9071e359b9a3 Uploaded xuebing parents: diff changeset	44 stats[ 'Max' ] = max( lengths )
9071e359b9a3 Uploaded xuebing parents: diff changeset	45 stats[ 'Avg' ] = int( sum( lengths ) / float( len( lengths ) ) )
9071e359b9a3 Uploaded xuebing parents: diff changeset	46 stats[ 'N50' ] = 0
9071e359b9a3 Uploaded xuebing parents: diff changeset	47 stats[ '99%' ] = 0
9071e359b9a3 Uploaded xuebing parents: diff changeset	48 if genomeLength == 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	49 genomeLength = sum( lengths )
9071e359b9a3 Uploaded xuebing parents: diff changeset	50 lengths.sort()
9071e359b9a3 Uploaded xuebing parents: diff changeset	51 lengths.reverse()
9071e359b9a3 Uploaded xuebing parents: diff changeset	52 lenSum = 0
9071e359b9a3 Uploaded xuebing parents: diff changeset	53 stats[ "99%" ] = len( lengths )
9071e359b9a3 Uploaded xuebing parents: diff changeset	54 for idx, length in enumerate( lengths ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	55 lenSum += length
9071e359b9a3 Uploaded xuebing parents: diff changeset	56 if ( lenSum > genomeLength / 2 ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	57 stats[ "N50" ] = length
9071e359b9a3 Uploaded xuebing parents: diff changeset	58 break
9071e359b9a3 Uploaded xuebing parents: diff changeset	59 lenSum = 0
9071e359b9a3 Uploaded xuebing parents: diff changeset	60 for idx, length in enumerate( lengths ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	61 lenSum += length
9071e359b9a3 Uploaded xuebing parents: diff changeset	62 if lenSum > genomeLength * 0.99:
9071e359b9a3 Uploaded xuebing parents: diff changeset	63 stats[ "99%" ] = idx + 1
9071e359b9a3 Uploaded xuebing parents: diff changeset	64 break
9071e359b9a3 Uploaded xuebing parents: diff changeset	65 return stats
9071e359b9a3 Uploaded xuebing parents: diff changeset	66
9071e359b9a3 Uploaded xuebing parents: diff changeset	67 def __main__():
9071e359b9a3 Uploaded xuebing parents: diff changeset	68 #Parse Command Line
9071e359b9a3 Uploaded xuebing parents: diff changeset	69 usage = 'Usage: %prog input output --minContigLength'
9071e359b9a3 Uploaded xuebing parents: diff changeset	70 parser = OptionParser( usage=usage )
9071e359b9a3 Uploaded xuebing parents: diff changeset	71 parser.add_option( "--minContigLength", dest="minContigLength", help="Minimum length of contigs to analyze" )
9071e359b9a3 Uploaded xuebing parents: diff changeset	72 parser.add_option( "--genomeLength", dest="genomeLength", help="Length of genome for which to calculate N50s" )
9071e359b9a3 Uploaded xuebing parents: diff changeset	73 parser.set_defaults( minContigLength=0, genomeLength=0 )
9071e359b9a3 Uploaded xuebing parents: diff changeset	74 options, args = parser.parse_args()
9071e359b9a3 Uploaded xuebing parents: diff changeset	75 input_fasta_file = args[ 0 ]
9071e359b9a3 Uploaded xuebing parents: diff changeset	76 output_tabular_file = args[ 1 ]
9071e359b9a3 Uploaded xuebing parents: diff changeset	77 statKeys = "Num Sum Max Avg N50 99%".split( " " )
9071e359b9a3 Uploaded xuebing parents: diff changeset	78 stats = getStats( input_fasta_file, int( options.genomeLength ), int( options.minContigLength ) )
9071e359b9a3 Uploaded xuebing parents: diff changeset	79 fout = open( output_tabular_file, "w" )
9071e359b9a3 Uploaded xuebing parents: diff changeset	80 fout.write( "%s\n" % "\t".join( map( lambda key: str( stats[ key ] ), statKeys ) ) )
9071e359b9a3 Uploaded xuebing parents: diff changeset	81 fout.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	82
9071e359b9a3 Uploaded xuebing parents: diff changeset	83 if __name__=="__main__": __main__()

Mercurial > repos > xuebing > sharplabtool

annotate tools/ilmn_pacbio/assembly_stats.py @ 0:9071e359b9a3