Mercurial > repos > vipints > deseq_hts
comparison dexseq-hts_1.0/src/dexseq_count.py @ 11:cec4b4fb30be draft default tip
DEXSeq version 1.6 added
| author | vipints <vipin@cbio.mskcc.org> |
|---|---|
| date | Tue, 08 Oct 2013 08:22:45 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 10:2fe512c7bfdf | 11:cec4b4fb30be |
|---|---|
| 1 import sys, itertools, optparse | |
| 2 | |
| 3 optParser = optparse.OptionParser( | |
| 4 | |
| 5 usage = "python %prog [options] <flattened_gff_file> <sam_file> <output_file>", | |
| 6 | |
| 7 description= | |
| 8 "This script counts how many reads in <sam_file> fall onto each exonic " + | |
| 9 "part given in <flattened_gff_file> and outputs a list of counts in " + | |
| 10 "<output_file>, for further analysis with the DEXSeq Bioconductor package. " + | |
| 11 "(Notes: The <flattened_gff_file> should be produced with the script " + | |
| 12 "dexseq_prepare_annotation.py). <sam_file> may be '-' to indicate standard input.", | |
| 13 | |
| 14 epilog = | |
| 15 "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " + | |
| 16 "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " + | |
| 17 "Public License v3. Part of the 'DEXSeq' package." ) | |
| 18 | |
| 19 optParser.add_option( "-p", "--paired", type="choice", dest="paired", | |
| 20 choices = ( "no", "yes" ), default = "no", | |
| 21 help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" ) | |
| 22 | |
| 23 optParser.add_option( "-s", "--stranded", type="choice", dest="stranded", | |
| 24 choices = ( "yes", "no", "reverse" ), default = "yes", | |
| 25 help = "'yes', 'no', or 'reverse'. Indicates whether the data is " + | |
| 26 "from a strand-specific assay (default: yes ). " + | |
| 27 "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " + | |
| 28 "preparation protocol. 'reverse' inverts strands and is neede for certain " + | |
| 29 "protocols, e.g. paired-end with circularization." ) | |
| 30 | |
| 31 optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual", | |
| 32 default = 10, | |
| 33 help = "skip all reads with alignment quality lower than the given " + | |
| 34 "minimum value (default: 10)" ) | |
| 35 | |
| 36 if len( sys.argv ) == 1: | |
| 37 optParser.print_help() | |
| 38 sys.exit(1) | |
| 39 | |
| 40 (opts, args) = optParser.parse_args() | |
| 41 | |
| 42 if len( args ) != 3: | |
| 43 sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" ) | |
| 44 sys.stderr.write( " Call with '-h' to get usage information.\n" ) | |
| 45 sys.exit( 1 ) | |
| 46 | |
| 47 try: | |
| 48 import HTSeq | |
| 49 except ImportError: | |
| 50 sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) | |
| 51 sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) | |
| 52 sys.exit(1) | |
| 53 | |
| 54 gff_file = args[0] | |
| 55 sam_file = args[1] | |
| 56 out_file = args[2] | |
| 57 stranded = opts.stranded == "yes" or opts.stranded == "reverse" | |
| 58 reverse = opts.stranded == "reverse" | |
| 59 is_PE = opts.paired == "yes" | |
| 60 minaqual = opts.minaqual | |
| 61 | |
| 62 if sam_file == "-": | |
| 63 sam_file = sys.stdin | |
| 64 | |
| 65 # Step 1: Read in the GFF file as generated by aggregate_genes.py | |
| 66 # and put everything into a GenomicArrayOfSets | |
| 67 | |
| 68 features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded ) | |
| 69 for f in HTSeq.GFF_Reader( gff_file ): | |
| 70 if f.type == "exonic_part": | |
| 71 f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number'] | |
| 72 features[f.iv] += f | |
| 73 | |
| 74 # initialise counters | |
| 75 num_reads = 0 | |
| 76 counts = {} | |
| 77 counts[ '_empty' ] = 0 | |
| 78 counts[ '_ambiguous' ] = 0 | |
| 79 counts[ '_lowaqual' ] = 0 | |
| 80 counts[ '_notaligned' ] = 0 | |
| 81 | |
| 82 # put a zero for each feature ID | |
| 83 for iv, s in features.steps(): | |
| 84 for f in s: | |
| 85 counts[ f.name ] = 0 | |
| 86 | |
| 87 #We need this little helper below: | |
| 88 def reverse_strand( s ): | |
| 89 if s == "+": | |
| 90 return "-" | |
| 91 elif s == "-": | |
| 92 return "+" | |
| 93 else: | |
| 94 raise SystemError, "illegal strand" | |
| 95 | |
| 96 # Now go through the aligned reads | |
| 97 | |
| 98 if not is_PE: | |
| 99 | |
| 100 num_reads = 0 | |
| 101 for a in HTSeq.SAM_Reader( sam_file ): | |
| 102 if not a.aligned: | |
| 103 counts[ '_notaligned' ] += 1 | |
| 104 continue | |
| 105 if a.aQual < minaqual: | |
| 106 counts[ '_lowaqual' ] += 1 | |
| 107 continue | |
| 108 rs = set() | |
| 109 for cigop in a.cigar: | |
| 110 if cigop.type != "M": | |
| 111 continue | |
| 112 if reverse: | |
| 113 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) | |
| 114 for iv, s in features[cigop.ref_iv].steps( ): | |
| 115 rs = rs.union( s ) | |
| 116 set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) | |
| 117 if len( set_of_gene_names ) == 0: | |
| 118 counts[ '_empty' ] += 1 | |
| 119 elif len( set_of_gene_names ) > 1: | |
| 120 counts[ '_ambiguous' ] +=1 | |
| 121 else: | |
| 122 for f in rs: | |
| 123 counts[ f.name ] += 1 | |
| 124 num_reads += 1 | |
| 125 if num_reads % 100000 == 0: | |
| 126 sys.stdout.write( "%d reads processed.\n" % num_reads ) | |
| 127 | |
| 128 else: # paired-end | |
| 129 | |
| 130 num_reads = 0 | |
| 131 for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ): | |
| 132 rs = set() | |
| 133 if af and ar and not af.aligned and not ar.aligned: | |
| 134 counts[ '_notaligned' ] += 1 | |
| 135 continue | |
| 136 if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: | |
| 137 counts[ '_lowaqual' ] += 1 | |
| 138 continue | |
| 139 if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys(): | |
| 140 for cigop in af.cigar: | |
| 141 if cigop.type != "M": | |
| 142 continue | |
| 143 if reverse: | |
| 144 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) | |
| 145 for iv, s in features[cigop.ref_iv].steps(): | |
| 146 rs = rs.union( s ) | |
| 147 if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in features.chrom_vectors.keys(): | |
| 148 for cigop in ar.cigar: | |
| 149 if cigop.type != "M": | |
| 150 continue | |
| 151 if not reverse: | |
| 152 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) | |
| 153 for iv, s in features[cigop.ref_iv].steps(): | |
| 154 rs = rs.union( s ) | |
| 155 set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) | |
| 156 if len( set_of_gene_names ) == 0: | |
| 157 counts[ '_empty' ] += 1 | |
| 158 elif len( set_of_gene_names ) > 1: | |
| 159 counts[ '_ambiguous' ] = 0 | |
| 160 else: | |
| 161 for f in rs: | |
| 162 counts[ f.name ] += 1 | |
| 163 num_reads += 1 | |
| 164 if num_reads % 100000 == 0: | |
| 165 sys.stderr.write( "%d reads processed.\n" % num_reads ) | |
| 166 | |
| 167 | |
| 168 # Step 3: Write out the results | |
| 169 | |
| 170 fout = open( out_file, "w" ) | |
| 171 for fn in sorted( counts.keys() ): | |
| 172 fout.write( "%s\t%d\n" % ( fn, counts[fn] ) ) | |
| 173 fout.close() |
