| 
11
 | 
     1 import sys, itertools, optparse
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 optParser = optparse.OptionParser( 
 | 
| 
 | 
     4    
 | 
| 
 | 
     5    usage = "python %prog [options] <flattened_gff_file> <sam_file> <output_file>",
 | 
| 
 | 
     6    
 | 
| 
 | 
     7    description=
 | 
| 
 | 
     8       "This script counts how many reads in <sam_file> fall onto each exonic " +
 | 
| 
 | 
     9       "part given in <flattened_gff_file> and outputs a list of counts in " +
 | 
| 
 | 
    10       "<output_file>, for further analysis with the DEXSeq Bioconductor package. " +
 | 
| 
 | 
    11       "(Notes: The <flattened_gff_file> should be produced with the script " +
 | 
| 
 | 
    12       "dexseq_prepare_annotation.py). <sam_file> may be '-' to indicate standard input.",
 | 
| 
 | 
    13       
 | 
| 
 | 
    14    epilog = 
 | 
| 
 | 
    15       "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " +
 | 
| 
 | 
    16       "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
 | 
| 
 | 
    17       "Public License v3. Part of the 'DEXSeq' package." )
 | 
| 
 | 
    18       
 | 
| 
 | 
    19 optParser.add_option( "-p", "--paired", type="choice", dest="paired",
 | 
| 
 | 
    20    choices = ( "no", "yes" ), default = "no",
 | 
| 
 | 
    21    help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" )
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 optParser.add_option( "-s", "--stranded", type="choice", dest="stranded",
 | 
| 
 | 
    24    choices = ( "yes", "no", "reverse" ), default = "yes",
 | 
| 
 | 
    25    help = "'yes', 'no', or 'reverse'. Indicates whether the data is " +
 | 
| 
 | 
    26       "from a strand-specific assay (default: yes ). " +
 | 
| 
 | 
    27       "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " +
 | 
| 
 | 
    28       "preparation protocol. 'reverse' inverts strands and is neede for certain " +
 | 
| 
 | 
    29       "protocols, e.g. paired-end with circularization."  )
 | 
| 
 | 
    30    
 | 
| 
 | 
    31 optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual",
 | 
| 
 | 
    32    default = 10,
 | 
| 
 | 
    33    help = "skip all reads with alignment quality lower than the given " +
 | 
| 
 | 
    34       "minimum value (default: 10)" )
 | 
| 
 | 
    35    
 | 
| 
 | 
    36 if len( sys.argv ) == 1:
 | 
| 
 | 
    37    optParser.print_help()
 | 
| 
 | 
    38    sys.exit(1)
 | 
| 
 | 
    39 
 | 
| 
 | 
    40 (opts, args) = optParser.parse_args()
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 if len( args ) != 3:
 | 
| 
 | 
    43    sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" )
 | 
| 
 | 
    44    sys.stderr.write( "  Call with '-h' to get usage information.\n" )
 | 
| 
 | 
    45    sys.exit( 1 )
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 try:
 | 
| 
 | 
    48    import HTSeq
 | 
| 
 | 
    49 except ImportError:
 | 
| 
 | 
    50    sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" )   
 | 
| 
 | 
    51    sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" )   
 | 
| 
 | 
    52    sys.exit(1)
 | 
| 
 | 
    53 
 | 
| 
 | 
    54 gff_file = args[0]
 | 
| 
 | 
    55 sam_file = args[1]
 | 
| 
 | 
    56 out_file = args[2]
 | 
| 
 | 
    57 stranded = opts.stranded == "yes" or opts.stranded == "reverse"
 | 
| 
 | 
    58 reverse = opts.stranded == "reverse"
 | 
| 
 | 
    59 is_PE = opts.paired == "yes"
 | 
| 
 | 
    60 minaqual = opts.minaqual
 | 
| 
 | 
    61 
 | 
| 
 | 
    62 if sam_file == "-":
 | 
| 
 | 
    63    sam_file = sys.stdin
 | 
| 
 | 
    64 
 | 
| 
 | 
    65 # Step 1: Read in the GFF file as generated by aggregate_genes.py
 | 
| 
 | 
    66 # and put everything into a GenomicArrayOfSets
 | 
| 
 | 
    67 
 | 
| 
 | 
    68 features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded )     
 | 
| 
 | 
    69 for f in  HTSeq.GFF_Reader( gff_file ):
 | 
| 
 | 
    70    if f.type == "exonic_part":
 | 
| 
 | 
    71       f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number']
 | 
| 
 | 
    72       features[f.iv] += f
 | 
| 
 | 
    73 
 | 
| 
 | 
    74 # initialise counters
 | 
| 
 | 
    75 num_reads = 0
 | 
| 
 | 
    76 counts = {}
 | 
| 
 | 
    77 counts[ '_empty' ] = 0
 | 
| 
 | 
    78 counts[ '_ambiguous' ] = 0
 | 
| 
 | 
    79 counts[ '_lowaqual' ] = 0
 | 
| 
 | 
    80 counts[ '_notaligned' ] = 0
 | 
| 
 | 
    81 
 | 
| 
 | 
    82 # put a zero for each feature ID
 | 
| 
 | 
    83 for iv, s in features.steps():
 | 
| 
 | 
    84    for f in s:
 | 
| 
 | 
    85       counts[ f.name ] = 0
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 #We need this little helper below:
 | 
| 
 | 
    88 def reverse_strand( s ):
 | 
| 
 | 
    89    if s == "+":
 | 
| 
 | 
    90       return "-"
 | 
| 
 | 
    91    elif s == "-":
 | 
| 
 | 
    92       return "+"
 | 
| 
 | 
    93    else:
 | 
| 
 | 
    94       raise SystemError, "illegal strand"
 | 
| 
 | 
    95 
 | 
| 
 | 
    96 # Now go through the aligned reads
 | 
| 
 | 
    97 
 | 
| 
 | 
    98 if not is_PE:
 | 
| 
 | 
    99 
 | 
| 
 | 
   100    num_reads = 0
 | 
| 
 | 
   101    for a in HTSeq.SAM_Reader( sam_file ):
 | 
| 
 | 
   102       if not a.aligned:
 | 
| 
 | 
   103          counts[ '_notaligned' ] += 1
 | 
| 
 | 
   104          continue
 | 
| 
 | 
   105       if a.aQual < minaqual:
 | 
| 
 | 
   106          counts[ '_lowaqual' ] += 1
 | 
| 
 | 
   107          continue
 | 
| 
 | 
   108       rs = set()
 | 
| 
 | 
   109       for cigop in a.cigar:
 | 
| 
 | 
   110          if cigop.type != "M":
 | 
| 
 | 
   111             continue
 | 
| 
 | 
   112          if reverse:
 | 
| 
 | 
   113             cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
 | 
| 
 | 
   114          for iv, s in features[cigop.ref_iv].steps( ):
 | 
| 
 | 
   115             rs = rs.union( s )
 | 
| 
 | 
   116       set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
 | 
| 
 | 
   117       if len( set_of_gene_names ) == 0:
 | 
| 
 | 
   118          counts[ '_empty' ] += 1
 | 
| 
 | 
   119       elif len( set_of_gene_names ) > 1:
 | 
| 
 | 
   120          counts[ '_ambiguous' ] +=1
 | 
| 
 | 
   121       else:
 | 
| 
 | 
   122          for f in rs:
 | 
| 
 | 
   123             counts[ f.name ] += 1
 | 
| 
 | 
   124       num_reads += 1
 | 
| 
 | 
   125       if num_reads % 100000 == 0:
 | 
| 
 | 
   126          sys.stdout.write( "%d reads processed.\n" % num_reads )
 | 
| 
 | 
   127 
 | 
| 
 | 
   128 else: # paired-end
 | 
| 
 | 
   129 
 | 
| 
 | 
   130    num_reads = 0
 | 
| 
 | 
   131    for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ):
 | 
| 
 | 
   132       rs = set()
 | 
| 
 | 
   133       if af and ar and not af.aligned and not ar.aligned:
 | 
| 
 | 
   134          counts[ '_notaligned' ] += 1
 | 
| 
 | 
   135          continue
 | 
| 
 | 
   136       if af and ar and not af.aQual < minaqual and ar.aQual < minaqual:
 | 
| 
 | 
   137          counts[ '_lowaqual' ] += 1
 | 
| 
 | 
   138          continue
 | 
| 
 | 
   139       if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys():
 | 
| 
 | 
   140          for cigop in af.cigar:
 | 
| 
 | 
   141             if cigop.type != "M":
 | 
| 
 | 
   142                continue
 | 
| 
 | 
   143             if reverse:
 | 
| 
 | 
   144                cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
 | 
| 
 | 
   145             for iv, s in features[cigop.ref_iv].steps():
 | 
| 
 | 
   146                rs = rs.union( s )
 | 
| 
 | 
   147       if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in features.chrom_vectors.keys():
 | 
| 
 | 
   148          for cigop in ar.cigar:
 | 
| 
 | 
   149             if cigop.type != "M":
 | 
| 
 | 
   150                continue
 | 
| 
 | 
   151             if not reverse:
 | 
| 
 | 
   152                cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
 | 
| 
 | 
   153             for iv, s in features[cigop.ref_iv].steps():
 | 
| 
 | 
   154                   rs = rs.union( s )
 | 
| 
 | 
   155       set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
 | 
| 
 | 
   156       if len( set_of_gene_names ) == 0:
 | 
| 
 | 
   157          counts[ '_empty' ] += 1
 | 
| 
 | 
   158       elif len( set_of_gene_names ) > 1:
 | 
| 
 | 
   159          counts[ '_ambiguous' ] = 0
 | 
| 
 | 
   160       else:
 | 
| 
 | 
   161          for f in rs:
 | 
| 
 | 
   162             counts[ f.name ] += 1
 | 
| 
 | 
   163       num_reads += 1
 | 
| 
 | 
   164       if num_reads % 100000 == 0:
 | 
| 
 | 
   165          sys.stderr.write( "%d reads processed.\n" % num_reads )
 | 
| 
 | 
   166 
 | 
| 
 | 
   167  
 | 
| 
 | 
   168 # Step 3: Write out the results
 | 
| 
 | 
   169 
 | 
| 
 | 
   170 fout = open( out_file, "w" )
 | 
| 
 | 
   171 for fn in sorted( counts.keys() ):
 | 
| 
 | 
   172    fout.write( "%s\t%d\n" % ( fn, counts[fn] ) )
 | 
| 
 | 
   173 fout.close()
 |