view blast_parser.py @ 4:363f3480622d draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 598e396ffaf5834a1c8f32868b1568a2b48a0a78-dirty
author earlhaminst
date Thu, 12 Oct 2017 07:25:29 -0400
parents 70df762b48a8
children
line wrap: on
line source

"""
Simple parser to convert a BLAST 12-column or 24-column tabular output into a
3-column tabular input for hcluster_hg (id1, id2, weight):
"""
import argparse
import math
import sqlite3
import tempfile

BATCH_SIZE = 2000


def create_tables(conn):
    cur = conn.cursor()
    cur.execute('''CREATE TABLE alignment (
        id INTEGER PRIMARY KEY,
        sequence1_id VARCHAR NOT NULL,
        sequence2_id VARCHAR NOT NULL,
        weight INTEGER NOT NULL)''')
    conn.commit()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file')
    parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file')
    parser.add_argument('-r', action='store_true', default=False,
                        dest='reciprocal',
                        help='Annotate homolog pair')
    options = parser.parse_args()

    db_file = tempfile.NamedTemporaryFile(suffix=".sqlite")

    conn = sqlite3.connect(db_file.name)
    conn.execute('PRAGMA foreign_keys = ON')

    create_tables(conn)

    cur = conn.cursor()

    i = 0
    for line in options.i:
        line = line.rstrip()
        line_cols = line.split('\t')
        sequence1_id = line_cols[0]
        sequence2_id = line_cols[1]

        # Ignore self-matching hits
        if sequence1_id == sequence2_id:
            continue

        i = i + 1
        evalue = float(line_cols[10])

        # Convert evalue to an integer weight with max 100
        if evalue != 0.0:
            weight = min(100, round(math.log10(evalue) / -2.0))
        else:
            # If the evalue is 0, leave weight at 100
            weight = 100

        cur.execute('INSERT INTO alignment (id, sequence1_id, sequence2_id, weight) VALUES (?, ?, ?, ?)',
                    (i, sequence1_id, sequence2_id, weight))

        # Commit transaction at every BATCH_SIZE rows to save memory
        if i % BATCH_SIZE == 0:
            conn.commit()
    # Commit final transaction
    conn.commit()
    options.i.close()

    # Delete alternative alignments keeping only one with max weight
    cur.execute('DELETE FROM alignment WHERE id NOT IN (SELECT id FROM alignment GROUP BY sequence1_id, sequence2_id HAVING weight=max(weight))')
    conn.commit()

    if options.reciprocal:
        query = 'SELECT a1.sequence1_id, a1.sequence2_id, a1.weight FROM alignment a1, alignment a2 WHERE a1.sequence1_id = a2.sequence2_id AND a1.sequence2_id = a2.sequence1_id ORDER BY a1.id'
    else:
        query = 'SELECT sequence1_id, sequence2_id, weight FROM alignment ORDER BY id'

    cur.execute(query)
    while True:
        rows = cur.fetchmany(BATCH_SIZE)
        if not rows:
            break
        for row in rows:
            options.o.write("%s\t%s\t%d\n" % row)

    conn.close()
    db_file.close()
    options.o.close()


if __name__ == "__main__":
    main()