diff fasta_charges.py @ 1:054f96a0d0fb draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:41:30 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_charges.py	Mon Jun 05 02:41:30 2023 +0000
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+import argparse
+from Bio import SeqIO
+import logging
+
+# Send INFO-level (and higher) records to the default root handler.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("charges")
+
+# Opening and closing markup wrapped around the HTML report.
+HTML_HEADER = "<html><head><title>Charges Report</title></head><body>"
+HTML_FOOTER = "</body></html>"
+
+# Opening and closing markup wrapped around the SVG report.
+# SVG_HEADER is %-formatted with (width, height).
+SVG_HEADER = (
+    '<svg width="%i" height="%i" '
+    'xmlns="http://www.w3.org/2000/svg">\n'
+)
+SVG_FOOTER = "</svg>"
+
+# One-letter residue codes; charges_svg compares the user's groups against
+# this list to find the residues that no group covers.
+FULL_AA = list("HSQTNCYAVILMPFWGERDK")
+
+
+def charges_html(svg, fasta, aa, fgColor, bgColor, width=120):
+    """Render every sequence of a FASTA file as an HTML charge report.
+
+    Each sequence is split into lines of ``width`` residues.  For every line
+    three rows are written inside a ``<pre>`` element: a row of ``+`` (K, R)
+    and ``-`` (D, E) markers, the residues themselves (wrapped in a coloured
+    ``<span>`` when they belong to one of the ``aa`` groups) and a ruler
+    showing every tenth sequence position.
+
+    Args:
+        svg: Unused here; accepted so the whole argparse namespace can be
+            passed as keyword arguments.
+        fasta: File handle or path handed to ``SeqIO.parse``.
+        aa: Residue groups, one string of one-letter codes per group
+            (for example ``["KR", "DE"]``).  ``None`` means no groups.
+        fgColor: Text colour for each group, parallel to ``aa``.
+        bgColor: Background colour for each group, parallel to ``aa``.
+        width: Number of residues per output line.
+
+    Returns:
+        The complete HTML document as a string.
+    """
+    from html import escape
+
+    # zip() is a one-shot iterator, so materialise the scheme once instead
+    # of building it twice.  Missing (None) lists are treated as empty.
+    colour_scheme = list(
+        zip([x.upper() for x in aa or []], bgColor or [], fgColor or [])
+    )
+    match_list = [group[0] for group in colour_scheme]
+
+    # CSS and header styling
+    css = """<style type="text/css">
+    .list li { list-style: none; margin:10px }
+    .info { float:left; width:20px }
+    pre { font-size:1.3em }
+    """
+    css_rules = []
+    legend_items = []
+    for group in colour_scheme:
+        css_rules.append(".%s{ background: %s; color: %s}\n" % group)
+        legend_items.append(
+            '<li><span class="%s" style="padding:5px">%s</span></li>\n'
+            % (group[0], group[0])
+        )
+    css += "".join(css_rules) + "</style>"
+    info = (
+        '<h1>Charges</h1><h3>Legend</h3><ul class="list">'
+        + "".join(legend_items)
+        + "</ul>"
+    )
+
+    charge_symbol = {"K": "+", "R": "+", "D": "-", "E": "-"}
+
+    page = []
+    # Parse sequences from fasta file
+    for record in SeqIO.parse(fasta, "fasta"):
+        # The header text comes straight from the input file, so escape it
+        # before embedding it in markup.
+        page.append(
+            "<pre><h3>&gt;%s %s</h3>\n"
+            % (escape(record.id), escape(record.description))
+        )
+        seq = str(record.seq).upper()
+
+        for start in range(0, len(seq), width):
+            chunk = seq[start : start + width]
+            line_charges = []
+            line_residues = []
+            line_numbers = ""
+
+            for col, residue in enumerate(chunk, start=1):
+                line_charges.append(charge_symbol.get(residue, " "))
+
+                # Right-align each label so its last digit sits under the
+                # residue it numbers, even when width is not a multiple
+                # of ten.
+                position = start + col
+                if position % 10 == 0:
+                    line_numbers += str(position).rjust(col - len(line_numbers))
+
+                # Wrap the residue in the span of the first group holding it
+                markup = escape(residue)
+                for m in match_list:
+                    if residue in m:
+                        markup = '<span class="%s">%s</span>' % (m, markup)
+                        break
+                line_residues.append(markup)
+
+            page.append("".join(line_charges) + "\n")
+            page.append("".join(line_residues) + "\n")
+            page.append(line_numbers + "\n")
+            page.append("\n")
+        page.append("</pre>")
+    return HTML_HEADER + css + info + "".join(page) + HTML_FOOTER
+
+
+def charges_svg(svg, fasta, aa, fgColor, bgColor, width=120):
+    """Render every sequence of a FASTA file as an SVG charge report.
+
+    The drawing starts with a title and a legend (one coloured box per
+    residue group).  Each sequence then gets a header line followed, for
+    every ``width`` residues, by three text rows: ``+``/``-`` markers for
+    K/R and D/E, the residues drawn over coloured boxes (one box per run of
+    consecutive residues sharing a group) and a ruler showing every tenth
+    sequence position.  Records are striped with alternating background
+    rectangles.
+
+    Args:
+        svg: Unused here; accepted so the whole argparse namespace can be
+            passed as keyword arguments.
+        fasta: File handle or path handed to ``SeqIO.parse``.
+        aa: Residue groups, one string of one-letter codes per group.
+            ``None`` means no groups.
+        fgColor: Text colour for each group, parallel to ``aa``.
+        bgColor: Box colour for each group, parallel to ``aa``.
+        width: Number of residues per output line.
+
+    Returns:
+        The complete SVG document as a string.
+    """
+    from xml.sax.saxutils import escape
+
+    # Upper-case the groups: the sequence is upper-cased before matching and
+    # the CSS class names are built from the upper-cased group.
+    colour_scheme = list(
+        zip([x.upper() for x in aa or []], bgColor or [], fgColor or [])
+    )
+    group_names = [group[0] for group in colour_scheme]
+
+    svgWidth = 1100
+    defBox = "#ffffff"
+    defText = "#000000"
+
+    # Residues of FULL_AA that no coloured group covers share one default
+    # class, named after those residues.
+    defClass = "".join(
+        x for x in FULL_AA if not any(x in name for name in group_names)
+    )
+    # Residues outside every group (including codes absent from FULL_AA)
+    # need a class even when the groups cover all of FULL_AA.
+    fallback = defClass if defClass else "other"
+
+    # First group containing a residue wins.
+    residue_class = {}
+    for name in group_names:
+        for letter in name:
+            residue_class.setdefault(letter, name)
+
+    # CSS and header styling
+    classList = []  # classes shown in the legend, in display order
+    classes = ['<style type="text/css">\n<![CDATA[\n']
+    for name, box_colour, text_colour in colour_scheme:
+        classList.append(name)
+        classes.append("text.text_%s{fill: %s;}\n" % (name, text_colour))
+        classes.append(
+            "rect.rect_%s{fill: %s; stroke: %s;}\n" % (name, box_colour, box_colour)
+        )
+    classes.append("text.text_%s{fill: %s;}\n" % (fallback, defText))
+    classes.append("rect.rect_%s{fill: %s; stroke: %s;}\n" % (fallback, defBox, defBox))
+    if defClass:
+        classList.append(defClass)
+    classes.append("text.info_text{white-space: pre;}\n")
+    classes.append("rect.rEven{fill: #fdfdfd; stroke: #fbfbfb;}\n")
+    classes.append("rect.rOdd{fill: #f2f2fc; stroke: #fbfbfb;}\n")
+    classes.append("]]></style>\n")
+
+    # Layout constants (SVG user units)
+    yInd = 60  # current baseline; advanced as rows are written
+    yInc = 15  # row height
+    seqIndent = 35
+    idIndent = 20
+    letterLen = 8.4375  # horizontal advance used per monospace character
+
+    # Title and legend
+    title = [
+        '<text x="%s" y="%s" style="font-weight:bold; font-size:40px">Charges</text>\n'
+        % (idIndent * 0.5, yInd)
+    ]
+    yInd += 2 * yInc
+    title.append(
+        '<text x="%s" y="%s" style="font-size:18px">Legend:</text>\n'
+        % (idIndent, yInd)
+    )
+    yInd += 2 * yInc
+    for name in classList:
+        title.append(
+            '<rect x="%s" y="%s" width="%s" height="%s" class="rect_%s"/>\n'
+            % (seqIndent, yInd - yInc + 2, len(name) * letterLen, yInc, name)
+        )
+        title.append(
+            '<text x="%s" y="%s" class="text_%s" font-family="monospace" font-size="14">%s</text>\n'
+            % (seqIndent, yInd, name, name)
+        )
+        yInd += yInc + 3
+    yInd += yInc * 1.5
+
+    charge_symbol = {"K": "+", "R": "+", "D": "-", "E": "-"}
+
+    groups = []  # striped background rectangles, one per record
+    body = []
+    # Parse sequences from fasta file
+    for recNum, record in enumerate(SeqIO.parse(fasta, "fasta")):
+        # Header text comes from the input file; unescaped '&' or '<' would
+        # make the SVG invalid XML.
+        body.append(
+            '<g><text x="%s" y="%s" style="font-weight:bold">&gt;%s %s</text>\n'
+            % (idIndent, yInd, escape(record.id), escape(record.description))
+        )
+        seq = str(record.seq).upper()
+        yTop = yInd - yInc - 3
+        yInd += yInc
+
+        for start in range(0, len(seq), width):
+            chunk = seq[start : start + width]
+            line_charges = "".join(charge_symbol.get(r, " ") for r in chunk)
+
+            line_numbers = ""
+            runs = []  # [class name, residues] per run of same-class residues
+            for col, residue in enumerate(chunk, start=1):
+                # Right-align each label so its last digit sits under the
+                # residue it numbers, even when width is not a multiple
+                # of ten.
+                position = start + col
+                if position % 10 == 0:
+                    line_numbers += str(position).rjust(col - len(line_numbers))
+
+                cls = residue_class.get(residue, fallback)
+                if runs and runs[-1][0] == cls:
+                    runs[-1][1] += residue
+                else:
+                    runs.append([cls, residue])
+
+            # Write line charges
+            body.append(
+                '<text x="%s" y="%s" class="info_text" font-family="monospace" font-size="14">%s</text>\n'
+                % (seqIndent, yInd, line_charges)
+            )
+            yInd += yInc
+
+            # Write sequence: one box plus one text element per run
+            drawn = 0
+            for cls, residues in runs:
+                body.append(
+                    '<rect x="%s" y="%s" width="%s" height="%s" class="rect_%s"/>\n'
+                    % (
+                        0.5 + seqIndent + (letterLen * drawn),
+                        yInd - yInc + 2,
+                        len(residues) * letterLen,
+                        yInc,
+                        cls,
+                    )
+                )
+                body.append(
+                    '<text x="%s" y="%s" class="text_%s" font-family="monospace" font-size="14">%s</text>\n'
+                    % (seqIndent + (letterLen * drawn), yInd, cls, escape(residues))
+                )
+                drawn += len(residues)
+            yInd += yInc
+
+            # Write numbers
+            body.append(
+                '<text x="%s" y="%s" class="info_text" font-size="14" font-family="monospace">%s</text>\n'
+                % (seqIndent, yInd, line_numbers + "\n")
+            )
+            yInd += yInc
+
+        body.append("</g>\n")
+        yInd += yInc
+        stripe = "rEven" if recNum % 2 == 0 else "rOdd"
+        groups.append(
+            '<rect x="0" y="%s" width="%s" height="%s" class="%s"/>\n'
+            % (yTop, svgWidth + 1, yInd - yTop, stripe)
+        )
+    svgHeight = yInd
+
+    return (
+        (SVG_HEADER % (svgWidth, svgHeight))
+        + "".join(title)
+        + "".join(classes)
+        + "".join(groups)
+        + "".join(body)
+        + SVG_FOOTER
+    )
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, then print either the SVG
+    # or the HTML report to standard output.
+    parser = argparse.ArgumentParser(
+        description="Render a charge report (HTML or SVG) for a protein FASTA file"
+    )
+    parser.add_argument(
+        "--svg", action="store_true", help="Write SVG instead of HTML"
+    )
+    parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta protein file")
+    parser.add_argument(
+        "--width", type=int, help="Residues per output line", default=120
+    )
+    # Default to empty lists so the report functions never receive None.
+    parser.add_argument(
+        "--aa", nargs="+", default=[], help="Residue groups, e.g. KR DE"
+    )
+    parser.add_argument(
+        "--fgColor", nargs="+", default=[], help="Text colour per residue group"
+    )
+    parser.add_argument(
+        "--bgColor", nargs="+", default=[], help="Background colour per residue group"
+    )
+
+    args = parser.parse_args()
+    if args.width < 1:
+        # range() needs a positive step to split the sequence into lines.
+        parser.error("--width must be a positive integer")
+    if not (len(args.aa) == len(args.fgColor) == len(args.bgColor)):
+        # The groups and colours are zipped together, so surplus entries
+        # would otherwise be dropped without any notice.
+        log.warning(
+            "--aa, --fgColor and --bgColor differ in length; extra entries are ignored"
+        )
+
+    # The argparse destinations match the report functions' parameter names,
+    # so the namespace is passed straight through as keyword arguments.
+    if args.svg:
+        print(charges_svg(**vars(args)))
+    else:
+        print(charges_html(**vars(args)))