diff extractSplitReads_BwaMem.py @ 1:1ed8619a5611 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit 0b55a106b1f76e3cc3d89932fef2cc8d3eb24e4f
author artbio
date Wed, 26 Jul 2017 18:17:01 -0400
parents 796552c157de
children 48e97429a749
line wrap: on
line diff
--- a/extractSplitReads_BwaMem.py	Mon Jul 24 08:03:17 2017 -0400
+++ b/extractSplitReads_BwaMem.py	Wed Jul 26 18:17:01 2017 -0400
@@ -1,12 +1,11 @@
 #!/usr/bin/env python
 
+import re
 import sys
-import getopt
-import string
 from optparse import OptionParser
-import re
+
 
-def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap):
+def extractSplitsFromBwaMem(inFile, numSplits, includeDups, minNonOverlap):
     if inFile == "stdin":
         data = sys.stdin
     else:
@@ -14,82 +13,89 @@
     for line in data:
         split = 0
         if line[0] == '@':
-            print line.strip()
+            print(line.strip())
             continue
         samList = line.strip().split('\t')
         sam = SAM(samList)
-        if includeDups==0 and (1024 & sam.flag)==1024:
+        if includeDups == 0 and (1024 & sam.flag) == 1024:
             continue
         for el in sam.tags:
             if "SA:" in el:
-                if(len(el.split(";")))<=numSplits:
+                if(len(el.split(";"))) <= numSplits:
                     split = 1
                     mate = el.split(",")
                     mateCigar = mate[3]
                     mateFlag = int(0)
-                    if mate[2]=="-": mateFlag = int(16)
+                    if mate[2] == "-":
+                        mateFlag = int(16)
         if split:
             read1 = sam.flag & 64
-            if read1 == 64: tag = "_1"
-            else: tag="_2"
+            if read1 == 64:
+                tag = "_1"
+            else:
+                tag = "_2"
             samList[0] = sam.query + tag
             readCigar = sam.cigar
-            readCigarOps = extractCigarOps(readCigar,sam.flag)
+            readCigarOps = extractCigarOps(readCigar, sam.flag)
             readQueryPos = calcQueryPosFromCigar(readCigarOps)
-            mateCigarOps = extractCigarOps(mateCigar,mateFlag)
+            mateCigarOps = extractCigarOps(mateCigar, mateFlag)
             mateQueryPos = calcQueryPosFromCigar(mateCigarOps)
-            overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos)
+            overlap = calcQueryOverlap(readQueryPos.qsPos, readQueryPos.qePos,
+                                       mateQueryPos.qsPos, mateQueryPos.qePos)
             nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap
             nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap
             mno = min(nonOverlap1, nonOverlap2)
             if mno >= minNonOverlap:
-                print "\t".join(samList)
+                print("\t".join(samList))
 
-#--------------------------------------------------------------------------------------------------
+# -----------------------------------------------------------------------
 # functions
-#--------------------------------------------------------------------------------------------------
+# -----------------------------------------------------------------------
+
 
 class SAM (object):
     """
     __very__ basic class for SAM input.
     """
-    def __init__(self, samList = []):
+    def __init__(self, samList=[]):
         if len(samList) > 0:
-            self.query    = samList[0]
-            self.flag     = int(samList[1])
-            self.ref      = samList[2]
-            self.pos      = int(samList[3])
-            self.mapq     = int(samList[4])
-            self.cigar    = samList[5]
-            self.matRef   = samList[6]
-            self.matePos  = int(samList[7])
-            self.iSize    = int(samList[8])
-            self.seq      = samList[9]
-            self.qual     = samList[10]
-            self.tags     = samList[11:]#tags is a list of each tag:vtype:value sets
-            self.valid    = 1
+            self.query = samList[0]
+            self.flag = int(samList[1])
+            self.ref = samList[2]
+            self.pos = int(samList[3])
+            self.mapq = int(samList[4])
+            self.cigar = samList[5]
+            self.matRef = samList[6]
+            self.matePos = int(samList[7])
+            self.iSize = int(samList[8])
+            self.seq = samList[9]
+            self.qual = samList[10]
+            self.tags = samList[11:]
+            # tags is a list of each tag:vtype:value sets
+            self.valid = 1
         else:
             self.valid = 0
             self.query = 'null'
 
-    def extractTagValue (self, tagID):
+    def extractTagValue(self, tagID):
         for tag in self.tags:
-            tagParts = tag.split(':', 2);
+            tagParts = tag.split(':', 2)
             if (tagParts[0] == tagID):
                 if (tagParts[1] == 'i'):
-                    return int(tagParts[2]);
+                    return int(tagParts[2])
                 elif (tagParts[1] == 'H'):
-                    return int(tagParts[2],16);
-                return tagParts[2];
-        return None;
+                    return int(tagParts[2], 16)
+                return tagParts[2]
+        return None
 
-#-----------------------------------------------
+
 cigarPattern = '([0-9]+[MIDNSHP])'
 cigarSearch = re.compile(cigarPattern)
 atomicCigarPattern = '([0-9]+)([MIDNSHP])'
 atomicCigarSearch = re.compile(atomicCigarPattern)
 
-def extractCigarOps(cigar,flag):
+
+def extractCigarOps(cigar, flag):
     if (cigar == "*"):
         cigarOps = []
     elif (flag & 0x0010):
@@ -104,7 +110,7 @@
             cigarOps.append(cigar)
             cigarOps = cigarOps
         cigarOps.reverse()
-        ##do in reverse order because negative strand##
+        # do in reverse order because negative strand
     else:
         cigarOpStrings = cigarSearch.findall(cigar)
         cigarOps = []
@@ -117,34 +123,38 @@
 #            cigarOps = cigarOps
     return(cigarOps)
 
+
 def calcQueryPosFromCigar(cigarOps):
     qsPos = 0
     qePos = 0
-    qLen  = 0
+    qLen = 0
     # if first op is a H, need to shift start position
-    # the opPosition counter sees if the for loop is looking at the first index of the cigar object
+    # the opPosition counter sees if the for loop is looking
+    # at the first index of the cigar object
     opPosition = 0
     for cigar in cigarOps:
         if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'):
             qsPos += cigar.length
             qePos += cigar.length
-            qLen  += cigar.length
+            qLen += cigar.length
         elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'):
-            qLen  += cigar.length
+            qLen += cigar.length
         elif cigar.op == 'M' or cigar.op == 'I':
             qePos += cigar.length
-            qLen  += cigar.length
+            qLen += cigar.length
             opPosition += 1
-    d = queryPos(qsPos, qePos, qLen);
+    d = queryPos(qsPos, qePos, qLen)
     return d
 
+
 class cigarOp (object):
     """
     sturct to store a discrete CIGAR operations
     """
     def __init__(self, opLength, op):
         self.length = int(opLength)
-        self.op     = op
+        self.op = op
+
 
 class queryPos (object):
     """
@@ -153,50 +163,60 @@
     def __init__(self, qsPos, qePos, qLen):
         self.qsPos = int(qsPos)
         self.qePos = int(qePos)
-        self.qLen  = int(qLen)
+        self.qLen = int(qLen)
 
 
-def calcQueryOverlap(s1,e1,s2,e2):
+def calcQueryOverlap(s1, e1, s2, e2):
     o = 1 + min(e1, e2) - max(s1, s2)
     return max(0, o)
 
 ###############################################
 
+
 class Usage(Exception):
     def __init__(self, msg):
         self.msg = msg
 
+
 def main():
-
     usage = """%prog -i <file>
 
 extractSplitReads_BwaMem v0.1.0
 Author: Ira Hall
-Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates.
+Description: Get split-read alignments from bwa-mem in lumpy compatible
+format. Ignores reads marked as duplicates.
 Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405.
     """
     parser = OptionParser(usage)
 
     parser.add_option("-i", "--inFile", dest="inFile",
-        help="A SAM file or standard input (-i stdin).",
-        metavar="FILE")
-    parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int",
-        help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2",
-        metavar="INT")
-    parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0,
-        help="Include alignments marked as duplicates. Default=False")
-    parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int",
-        help="minimum non-overlap between split alignments on the query (default=20)",
-        metavar="INT")
+                      help="A SAM file or standard input (-i stdin).",
+                      metavar="FILE")
+    parser.add_option("-n", "--numSplits", dest="numSplits", default=2,
+                      type="int",
+                      help='''The maximum number of split-read mappings to
+                      allow per read. Reads with more are excluded.
+                      Default=2''', metavar="INT")
+    parser.add_option("-d", "--includeDups", dest="includeDups",
+                      action="store_true", default=0,
+                      help='''Include alignments marked as duplicates.
+                      Default=False''')
+    parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap",
+                      default=20, type="int", help='''minimum non-overlap between
+                      split alignments on the query (default=20)''',
+                      metavar="INT")
     (opts, args) = parser.parse_args()
     if opts.inFile is None:
         parser.print_help()
         print
     else:
         try:
-            extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap)
+            extractSplitsFromBwaMem(opts.inFile, opts.numSplits,
+                                    opts.includeDups, opts.minNonOverlap)
         except IOError as err:
-            sys.stderr.write("IOError " + str(err) + "\n");
+            sys.stderr.write("IOError " + str(err) + "\n")
             return
+
+
 if __name__ == "__main__":
     sys.exit(main())