comparison filter.py @ 0:4e4e4093d65d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author iuc
date Wed, 11 Nov 2015 13:04:07 -0500
parents
children 7a68005de299
comparison
equal deleted inserted replaced
-1:000000000000 0:4e4e4093d65d
1 #!/usr/bin/env python
2 ## category General
3 ## desc Removes reads from a BAM file based on criteria
4 """
5 Removes reads from a BAM file based on criteria
6
7 Given a BAM file, this script will only allow reads that meet filtering
8 criteria to be written to output. The output is another BAM file with the
9 reads not matching the criteria removed.
10
11 Note: this does not adjust tag values reflecting any filtering. (for example:
12 if a read mapped to two locations (IH:i:2), and one was removed by
13 filtering, the IH:i tag would still read IH:i:2).
14
15 Currently, the available filters are:
16 -minlen val Remove reads that are smaller than {val}
17 -maxlen val Remove reads that are larger than {val}
18 -mapped Keep only mapped reads
19 -unmapped Keep only unmapped reads
20 -properpair Keep only properly paired reads (both mapped,
21 correct orientation, flag set in BAM)
22 -noproperpair Keep only not-properly paired reads
23
24 -mask bitmask Remove reads that match the mask (base 10/hex)
25 -uniq {length} Remove reads that are have the same sequence
26 Note: BAM file should be sorted
27 (up to an optional length)
28 -uniq_start Remove reads that start at the same position
29 Note: BAM file should be sorted
30 (Use only for low-coverage samples)
31
32 -mismatch num # mismatches or indels
33 indel always counts as 1 regardless of length
34 (requires NM tag in reads)
35
36 -mismatch_dbsnp num dbsnp.txt.bgz
37 # mismatches or indels - not in dbSNP.
38 Variations are called using the MD tag.
39 Variations that are found in the dbSNP list are
40 not counted as mismatches. The dbSNP list is a
41 Tabix-indexed dump of dbSNP (from UCSC Genome
42 Browser). Indels in dbSNP are also counted.
43 Adds a 'ZS:i' tag with the number of found SNPs
44 in the read.
45 (requires NM and MD tags)
46
47 Example command for indexing:
48 ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0
49
50 -mismatch_ref num ref.fa # mismatches or indel - looks up mismatches
51 directly in a reference FASTA file
52 (use if NM tag not present)
53
54 -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz
55 # mismatches or indels - looks up mismatches
56 directly from a reference FASTA file. (See
57 -mismatch_dbsnp for dbSNP matching)
58 (use if NM or MD tag not present)
59
60 -nosecondary Remove reads that have the 0x100 flag set
61 -noqcfail Remove reads that have the 0x200 flag set
62 -nopcrdup Remove reads that have the 0x400 flag set
63
64
65 -exclude ref:start-end Remove reads in this region (1-based start)
66 -excludebed file.bed {nostrand}
67 Remove reads that are in any of the regions
68 from the given BED file. If 'nostrand' is given,
69 strand information from the BED file is ignored.
70
71 -include ref:start-end Remove reads NOT in the region (can only be one)
72 -includebed file.bed {nostrand}
73 Remove reads that are NOT any of the regions
74 from the given BED file. If 'nostrand' is given,
75 strand information from the BED file is ignored.
76
77 Note: If this is a large dataset, use
78 "bamutils extract" instead.
79
80 -includeref refname Exclude reads NOT mapped to a reference
81 -excluderef refname Exclude reads mapped to a particular reference
82 (e.g. chrM, or _dup chromosomes)
83
84 -whitelist fname Remove reads that aren't on this list (by name)
85 -blacklist fname Remove reads that are on this list (by name)
86 These lists can be whitespace-delimited with
87 the read name as the first column.
88 -maximum_mismatch_ratio val
89 Filter by maximum mismatch ratio (fraction of length)
90
91 -eq tag_name value
92 -lt tag_name value
93 -lte tag_name value
94 -gt tag_name value
95 -gte tag_name value
96
97 As a special case, "MAPQ" can be used as the tag_name and the SAM MAPQ
98 value will be used.
99
100 Common tags to filter by:
101 AS Alignment score
102 IH Number of alignments
103 NM Edit distance (each indel counts as many as its length)
104
105 MAPQ Mapping quality (defined as part of SAM spec)
106
107 The tag type (:i, :f, :Z) is optional.
108
109 """
110
111 import os
112 import sys
113 import pysam
114 from ngsutils.bam import bam_iter
115 from ngsutils.support.dbsnp import DBSNP
116 from ngsutils.bam import read_calc_mismatches, read_calc_mismatches_ref, read_calc_mismatches_gen, read_calc_variations
117 from ngsutils.bed import BedFile
118
119
120 def usage():
121 print __doc__
122 print """
123 Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria...
124
125 Options:
126 -failed fname A text file containing the read names of all reads
127 that were removed with filtering
128
129 Example:
130 bamutils filter filename.bam output.bam -mapped -gte AS:i 1000
131
132 This will remove all unmapped reads, as well as any reads that have an AS:i
133 value less than 1000.
134 """
135 sys.exit(1)
136
137
138 class Unique(object):
139 def __init__(self, length=None):
140 if length:
141 self.length = int(length)
142 else:
143 self.length = None
144
145 self.last_pos = None
146 self.pos_reads = set()
147
148 def __repr__(self):
149 return "uniq"
150
151 def filter(self, bam, read):
152 if self.last_pos != (read.tid, read.pos):
153 self.last_pos = (read.tid, read.pos)
154 self.pos_reads = set()
155
156 if read.is_reverse:
157 seq = read.seq[::-1] # ignore revcomp for now, it isn't needed, just need to compare in the proper order
158 else:
159 seq = read.seq
160
161 if self.length:
162 seq = seq[:self.length]
163
164 if seq in self.pos_reads:
165 return False
166
167 self.pos_reads.add(seq)
168 return True
169
170 def close(self):
171 pass
172
173
174 class UniqueStart(object):
175 def __init__(self):
176 self.last_tid = None
177 self.last_fwd_pos = -1
178 self.rev_pos_list = None
179
180 def __repr__(self):
181 return "uniq_start"
182
183 def filter(self, bam, read):
184 if self.last_tid is None or self.last_tid != read.tid:
185 self.last_tid = read.tid
186 self.rev_pos = set()
187 self.last_fwd_pos = -1
188 self.last_rev_pos = -1
189
190 if read.is_reverse:
191 # check reverse reads from their start (3' aend)
192 # these aren't necessarily in the correct
193 # order in the file, so we have to track them in a set
194
195 start_pos = read.aend
196
197 # clean up hash if necesary
198 # Allow up to 100k over, to balance cleaning up the rev_pos hash
199 # and memory
200 if read.pos > (self.last_rev_pos + 100000):
201 self.last_rev_pos = start_pos
202 del_list = []
203 for k in self.rev_pos:
204 if k < read.pos:
205 del_list.append(k)
206
207 for k in del_list:
208 self.rev_pos.remove(k)
209
210 if not start_pos in self.rev_pos:
211 self.rev_pos.add(start_pos)
212 return True
213 return False
214 else:
215 if read.pos != self.last_fwd_pos:
216 self.last_fwd_pos = read.pos
217 return True
218 return False
219
220 # if self.last_pos != (read.tid, read.pos):
221 # self.last_pos = (read.tid, read.pos)
222 # return True
223 # return False
224
225 def close(self):
226 pass
227
228
229 class Blacklist(object):
230 def __init__(self, fname):
231 self.fname = fname
232 self.notallowed = set()
233 with open(fname) as f:
234 for line in f:
235 self.notallowed.add(line.strip().split()[0])
236
237 def filter(self, bam, read):
238 return read.qname not in self.notallowed
239
240 def __repr__(self):
241 return 'Blacklist: %s' % (self.fname)
242
243 def close(self):
244 pass
245
246
247 class Whitelist(object):
248 def __init__(self, fname):
249 self.fname = fname
250 self.allowed = set()
251 with open(fname) as f:
252 for line in f:
253 self.allowed.add(line.strip().split()[0])
254
255 def filter(self, bam, read):
256 return read.qname in self.allowed
257
258 def __repr__(self):
259 return 'Whitelist: %s' % (self.fname)
260
261 def close(self):
262 pass
263
264
265 class IncludeRegion(object):
266 _excludes = []
267 _last = None
268
269 def __init__(self, region):
270 IncludeRegion._excludes.append(ExcludeRegion(region))
271 # self.excl = ExcludeRegion(region)
272
273 def filter(self, bam, read):
274 if read == IncludeRegion._last:
275 return True
276
277 IncludeRegion._last = read
278
279 for excl in IncludeRegion._excludes:
280 if not excl.filter(bam, read):
281 return True
282 return False
283
284 # return not self.excl.filter(bam,read)
285 def __repr__(self):
286 return 'Including: %s' % (self.excl.region)
287
288 def close(self):
289 pass
290
291
292 class IncludeBED(object):
293 def __init__(self, fname, nostrand=None):
294 self.excl = ExcludeBED(fname, nostrand)
295
296 def filter(self, bam, read):
297 return not self.excl.filter(bam, read)
298
299 def __repr__(self):
300 return 'Including from BED: %s%s' % (self.excl.fname, ' nostrand' if self.excl.nostrand else '')
301
302 def close(self):
303 pass
304
305
306 class ExcludeRegion(object):
307 def __init__(self, region):
308 self.region = region
309 spl = region.split(':')
310 self.chrom = spl[0]
311 se = [int(x) for x in spl[1].split('-')]
312 self.start = se[0] - 1
313 self.end = se[1]
314
315 def filter(self, bam, read):
316 if not read.is_unmapped:
317 if bam.getrname(read.tid) == self.chrom:
318 if self.start <= read.pos <= self.end:
319 return False
320 if self.start <= read.aend <= self.end:
321 return False
322 return True
323
324 def __repr__(self):
325 return 'Excluding: %s' % (self.region)
326
327 def close(self):
328 pass
329
330
331 class ExcludeRef(object):
332 def __init__(self, ref):
333 self.ref = ref
334
335 def filter(self, bam, read):
336 if not read.is_unmapped:
337 if bam.getrname(read.tid) == self.ref:
338 return False
339 return True
340
341 def __repr__(self):
342 return 'Excluding: %s' % (self.ref)
343
344 def close(self):
345 pass
346
347 class IncludeRef(object):
348 def __init__(self, ref):
349 self.ref = ref
350
351 def filter(self, bam, read):
352 if not read.is_unmapped:
353 if bam.getrname(read.tid) == self.ref:
354 return True
355 return False
356
357 def __repr__(self):
358 return 'Including: %s' % (self.ref)
359
360 def close(self):
361 pass
362
363
364 class ExcludeBED(object):
365 def __init__(self, fname, nostrand=None):
366 self.regions = {} # store BED regions as keyed bins (chrom, bin)
367 self.fname = fname
368 if nostrand == 'nostrand':
369 self.nostrand = True
370 else:
371 self.nostrand = False
372
373 self.bed = BedFile(fname)
374 # with open(fname) as f:
375 # for line in f:
376 # if not line:
377 # continue
378 # if line[0] == '#':
379 # continue
380 # cols = line.strip().split('\t')
381
382 # chrom = cols[0]
383 # start = int(cols[1])
384 # end = int(cols[2])
385 # if self.nostrand:
386 # strand = '?'
387 # else:
388 # strand = cols[5]
389
390 # startbin = start / 100000
391 # endbin = end / 100000
392
393 # for bin in xrange(startbin, endbin + 1):
394 # if not (chrom, bin) in self.regions:
395 # self.regions[(chrom, bin)] = []
396 # self.regions[(chrom, bin)].append((start, end, strand))
397
398 def filter(self, bam, read):
399 if not read.is_unmapped:
400 if self.nostrand:
401 strand = None
402 elif read.is_reverse:
403 strand = '-'
404 else:
405 strand = '+'
406
407 for region in self.bed.fetch(bam.getrname(read.tid), read.pos, read.aend, strand):
408 # region found, exclude read
409 return False
410 return True
411
412 # bin = read.pos / 100000
413 # ref = bam.getrname(read.tid)
414
415 # if not (ref, bin) in self.regions:
416 # return True
417
418 # for start, end, strand in self.regions[(ref, bin)]:
419 # if not self.nostrand:
420 # if strand == '+' and read.is_reverse:
421 # continue
422 # if strand == '-' and not read.is_reverse:
423 # continue
424 # if start <= read.pos <= end:
425 # return False
426 # if start <= read.aend <= end:
427 # return False
428 # return True
429
430 def __repr__(self):
431 return 'Excluding from BED: %s%s' % (self.fname, ' nostrand' if self.nostrand else '')
432
433 def close(self):
434 pass
435
436
437 class Mismatch(object):
438 def __init__(self, num):
439 self.num = int(num)
440
441 def filter(self, bam, read):
442 if read.is_unmapped:
443 return False
444
445 if read_calc_mismatches(read) > self.num:
446 return False
447
448 return True
449
450 def __repr__(self):
451 return '>%s mismatch%s' % (self.num, '' if self.num == 1 else 'es')
452
453 def close(self):
454 pass
455
456
457 class MismatchRef(object):
458 def __init__(self, num, refname):
459 self.num = int(num)
460 self.refname = refname
461
462 if not os.path.exists('%s.fai' % refname):
463 pysam.faidx(refname)
464
465 self.ref = pysam.Fastafile(refname)
466
467 def filter(self, bam, read):
468 if read.is_unmapped:
469 return False
470
471 chrom = bam.getrname(read.tid)
472 if read_calc_mismatches_ref(self.ref, read, chrom) > self.num:
473 return False
474
475 return True
476
477 def __repr__(self):
478 return '>%s mismatch%s in %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.refname))
479
480 def close(self):
481 self.ref.close()
482
483
484 class MismatchDbSNP(object):
485 def __init__(self, num, fname, verbose=None):
486 sys.stderr.write('Note: MismatchDbSNP is considered *experimental*\n')
487
488 self.num = int(num)
489 self.fname = fname
490 self.dbsnp = DBSNP(fname)
491 if verbose == 'verbose':
492 self.verbose = True
493 else:
494 self.verbose = False
495
496 def filter(self, bam, read):
497 if read.is_unmapped:
498 return False
499
500 if read_calc_mismatches(read) <= self.num:
501 return True
502
503 chrom = bam.getrname(read.tid)
504
505 mm = 0
506 snps = 0
507
508 for op, pos, seq in read_calc_variations(read):
509 if not self.dbsnp.is_valid_variation(chrom, op, pos, seq, self.verbose):
510 mm += 1
511 else:
512 snps += 1
513
514 if mm > self.num:
515 return False
516
517 if snps:
518 read.tags = read.tags + [('ZS', snps)]
519
520 return True
521
522 def __repr__(self):
523 return '>%s mismatch%s using %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.fname))
524
525 def close(self):
526 self.dbsnp.close()
527
528
529 class MismatchRefDbSNP(object):
530 def __init__(self, num, refname, dbsnpname):
531 sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n')
532 self.num = int(num)
533 self.refname = refname
534 self.dbsnp = DBSNP(dbsnpname)
535
536 if not os.path.exists('%s.fai' % refname):
537 pysam.faidx(refname)
538
539 self.ref = pysam.Fastafile(refname)
540
541 def filter(self, bam, read):
542 if read.is_unmapped:
543 return False
544
545 chrom = bam.getrname(read.tid)
546
547 mm = 0
548 snps = 0
549
550 for op, pos, seq in read_calc_mismatches_gen(self.ref, read, chrom):
551 if not self.dbsnp.is_valid_variation(chrom, op, pos, seq):
552 mm += 1
553 else:
554 snps += 1
555
556 if mm > self.num:
557 return False
558
559 if snps:
560 read.tags = read.tags + [('ZS', snps)]
561
562 return True
563
564 def __repr__(self):
565 return '>%s mismatch%s using %s/%s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.dbsnpname), os.path.basename(self.refname))
566
567 def close(self):
568 self.ref.close()
569 self.dbsnp.close()
570
571
572 class Mapped(object):
573 def __init__(self):
574 pass
575
576 def filter(self, bam, read):
577 if read.is_paired and (read.is_unmapped or read.mate_is_unmapped):
578 return False
579 elif read.is_unmapped:
580 return False
581 return True
582
583 def __repr__(self):
584 return 'is mapped'
585
586 def close(self):
587 pass
588
589
590 class Unmapped(object):
591 def __init__(self):
592 pass
593
594 def filter(self, bam, read):
595 if read.is_paired and (read.is_unmapped or read.mate_is_unmapped):
596 return True
597 elif read.is_unmapped:
598 return True
599 return False
600
601 def __repr__(self):
602 return 'is unmapped'
603
604 def close(self):
605 pass
606
607
608 class ProperPair(object):
609 def __init__(self):
610 pass
611
612 def filter(self, bam, read):
613 if not read.is_paired:
614 return False
615
616 if read.is_unmapped or read.mate_is_unmapped:
617 return False
618
619 if read.is_reverse == read.mate_is_reverse:
620 return False
621
622 return read.is_proper_pair
623
624 def __repr__(self):
625 return 'proper pair'
626
627 def close(self):
628 pass
629
630
631 class NoProperPair(object):
632 def __init__(self):
633 self.proper = ProperPair()
634 pass
635
636 def filter(self, bam, read):
637 return not self.proper.filter(bam, read)
638
639 def __repr__(self):
640 return 'not proper pairs'
641
642 def close(self):
643 pass
644
645
646 class MaskFlag(object):
647 def __init__(self, value):
648 if type(value) == type(1):
649 self.flag = value
650 else:
651 if value[0:2] == '0x':
652 self.flag = int(value, 16)
653 else:
654 self.flag = int(value)
655
656 def __repr__(self):
657 return "Doesn't match flag: %s" % self.flag
658
659 def filter(self, bam, read):
660 return (read.flag & self.flag) == 0
661
662 def close(self):
663 pass
664
665
666 class SecondaryFlag(object):
667 def __repr__(self):
668 return "no 0x100 (secondary) flag"
669
670 def filter(self, bam, read):
671 return not read.is_secondary
672
673 def close(self):
674 pass
675
676
677 class ReadMinLength(object):
678 def __init__(self, minval):
679 self.minval = int(minval)
680
681 def __repr__(self):
682 return "read length min: %s" % self.minval
683
684 def filter(self, bam, read):
685 return len(read.seq) >= self.minval
686
687 def close(self):
688 pass
689
690
691 class ReadMaxLength(object):
692 def __init__(self, val):
693 self.val = int(val)
694
695 def __repr__(self):
696 return "read length max: %s" % self.val
697
698 def filter(self, bam, read):
699 return len(read.seq) <= self.val
700
701 def close(self):
702 pass
703
704
705 class MaximumMismatchRatio(object):
706 def __init__(self, ratio):
707 self.ratio = float(ratio)
708
709 def __repr__(self):
710 return "maximum mismatch ratio: %s" % self.val
711
712 def filter(self, bam, read):
713 return read_calc_mismatches(read) <= self.ratio*len(read.seq)
714
715 def close(self):
716 pass
717
718
719 class QCFailFlag(object):
720 def __repr__(self):
721 return "no 0x200 (qcfail) flag"
722
723 def filter(self, bam, read):
724 return not read.is_qcfail
725
726 def close(self):
727 pass
728
729
730 class PCRDupFlag(object):
731 def __repr__(self):
732 return "no 0x400 (pcrdup) flag"
733
734 def filter(self, bam, read):
735 return not read.is_duplicate
736
737 def close(self):
738 pass
739
740
741 class _TagCompare(object):
742 def __init__(self, tag, value):
743 self.args = '%s %s' % (tag, value)
744
745 if ':' in tag:
746 self.tag = tag.split(':')[0]
747 tagtype = tag.split(':')[1]
748 if tagtype == 'i':
749 self.value = int(value)
750 elif tagtype == 'f':
751 self.value = float(value)
752 elif tagtype == 'H':
753 self.value = int(value) # not sure about this
754 else:
755 self.value = value
756 else:
757 self.tag = tag
758
759 # guess at type...
760 try:
761 self.value = int(value)
762 except:
763 try:
764 self.value = float(value)
765 except:
766 self.value = value
767
768 def get_value(self, read):
769 if self.tag == 'MAPQ':
770 return read.mapq
771
772 for name, value in read.tags:
773 if name == self.tag:
774 return value
775
776 return None
777
778 def __repr__(self):
779 return "%s %s %s" % (self.tag, self.__class__.op, self.value)
780
781 def close(self):
782 pass
783
784
785 class TagLessThan(_TagCompare):
786 op = '<'
787
788 def filter(self, bam, read):
789 if self.get_value(read) < self.value:
790 return True
791 return False
792
793
794 class TagLessThanEqual(_TagCompare):
795 op = '<='
796
797 def filter(self, bam, read):
798 if self.get_value(read) <= self.value:
799 return True
800 return False
801
802
803 class TagGreaterThan(_TagCompare):
804 op = '>'
805
806 def filter(self, bam, read):
807 if self.get_value(read) > self.value:
808 return True
809 return False
810
811
812 class TagGreaterThanEqual(_TagCompare):
813 op = '>='
814
815 def filter(self, bam, read):
816 if self.get_value(read) >= self.value:
817 return True
818 return False
819
820
821 class TagEqual(_TagCompare):
822 op = '='
823
824 def filter(self, bam, read):
825 if self.get_value(read) == self.value:
826 return True
827 return False
828
829 _criteria = {
830 'mapped': Mapped,
831 'unmapped': Unmapped,
832 'properpair': ProperPair,
833 'noproperpair': NoProperPair,
834 'noqcfail': QCFailFlag,
835 'nosecondary': SecondaryFlag,
836 'nopcrdup': PCRDupFlag,
837 'mask': MaskFlag,
838 'lt': TagLessThan,
839 'gt': TagGreaterThan,
840 'lte': TagLessThanEqual,
841 'gte': TagGreaterThanEqual,
842 'eq': TagEqual,
843 'mismatch': Mismatch,
844 'mismatch_ref': MismatchRef,
845 'mismatch_dbsnp': MismatchDbSNP,
846 'mismatch_ref_dbsnp': MismatchRefDbSNP,
847 'exclude': ExcludeRegion,
848 'excludebed': ExcludeBED,
849 'excluderef': ExcludeRef,
850 'includeref': IncludeRef,
851 'include': IncludeRegion,
852 'includebed': IncludeBED,
853 'whitelist': Whitelist,
854 'blacklist': Blacklist,
855 'length': ReadMinLength, # deprecated
856 'minlen': ReadMinLength,
857 'maxlen': ReadMaxLength,
858 'uniq': Unique,
859 'maximum_mismatch_ratio': MaximumMismatchRatio
860 }
861
862
863 def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False):
864 if verbose:
865 sys.stderr.write('Input file : %s\n' % infile)
866 sys.stderr.write('Output file : %s\n' % outfile)
867 if failedfile:
868 sys.stderr.write('Failed reads: %s\n' % failedfile)
869 sys.stderr.write('Criteria:\n')
870 for criterion in criteria:
871 sys.stderr.write(' %s\n' % criterion)
872
873 sys.stderr.write('\n')
874
875 bamfile = pysam.Samfile(infile, "rb")
876 outfile = pysam.Samfile(outfile, "wb", template=bamfile)
877
878 if failedfile:
879 failed_out = open(failedfile, 'w')
880 else:
881 failed_out = None
882
883 passed = 0
884 failed = 0
885
886 def _callback(read):
887 return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed)
888
889 for read in bam_iter(bamfile, quiet=True):
890 p = True
891
892 for criterion in criteria:
893 if not criterion.filter(bamfile, read):
894 p = False
895 failed += 1
896 if failed_out:
897 failed_out.write('%s\t%s\n' % (read.qname, criterion))
898 #outfile.write(read_to_unmapped(read))
899 break
900 if p:
901 passed += 1
902 outfile.write(read)
903
904 bamfile.close()
905 outfile.close()
906 if failed_out:
907 failed_out.close()
908 sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
909
910 for criterion in criteria:
911 criterion.close()
912
913
914 def read_to_unmapped(read):
915 '''
916 Example unmapped read
917 2358_2045_1839 | 4 | * | 0 | 0 | * | * | 0 | 0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII | PG:Z:bfast | AS:i:- | NH:i:0 | IH:i:1 | HI:i:1 | CS:Z:T10213222020221330022203222011113021130222212230122 | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%= | CM:i:0 | XA:i:4
918
919 Example mapped read:
920 2216_1487_198 | 16 | chr11:19915291-19925392 | 5531 | 12 | 24M2D26M | * | 0 | 0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^` | PG:Z:bfast | AS:i:1150 | NM:i:2 | NH:i:10 | IH:i:1 | HI:i:1 | MD:Z:24^CT26 | CS:Z:T00103022001002113210023031213331122121223321221122 | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1 | CM:i:9 | XA:i:4 | XE:Z:-------23322---2-11----2----------------------------
921
922 '''
923 # TODO: rewrite read properties to unmapped state
924 # if colorspace: convert CS to NT directly
925 # remove excess tags
926
927 read.is_unmapped = True
928 read.tid = None
929 read.pos = 0
930 read.mapq = 0
931 return read
932
933 if __name__ == '__main__':
934 infile = None
935 outfile = None
936 failed = None
937 criteria = []
938
939 crit_args = []
940 last = None
941 verbose = False
942 fail = False
943
944 for arg in sys.argv[1:]:
945 if last == '-failed':
946 failed = arg
947 last = None
948 elif arg == '-h':
949 usage()
950 elif arg == '-failed':
951 last = arg
952 elif arg == '-v':
953 verbose = True
954 elif not infile and os.path.exists(arg):
955 infile = arg
956 elif not outfile:
957 outfile = arg
958 elif arg[0] == '-':
959 if not arg[1:] in _criteria:
960 print "Unknown criterion: %s" % arg
961 fail = True
962 if crit_args:
963 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
964 crit_args = [arg, ]
965 elif crit_args:
966 crit_args.append(arg)
967 else:
968 print "Unknown argument: %s" % arg
969 fail = True
970
971 if not fail and crit_args:
972 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
973
974 if fail or not infile or not outfile or not criteria:
975 if not infile and not outfile and not criteria:
976 usage()
977
978 if not infile:
979 print "Missing: input bamfile"
980 if not outfile:
981 print "Missing: output bamfile"
982 if not criteria:
983 print "Missing: filtering criteria"
984 usage()
985 else:
986 bam_filter(infile, outfile, criteria, failed, verbose)