view cherry_pick_fasta.py @ 2:321cad0eb507 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit b5ef783237b244d684e26b1ed1cc333a8305ce3e"
author artbio
date Tue, 16 Mar 2021 23:25:57 +0000
parents ea8fde9c6f82
children c282a8a47dd9
line wrap: on
line source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Cherry pick of fasta sequences satisfying a query string in their header/name
"""

import argparse


def Parser(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings to parse. When None (the
            default), argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        The argparse namespace with the attributes ``input``, ``searchfor``,
        ``query_string``, ``query_file`` and ``output``. Every option is
        optional, so an omitted option is returned as None.
    """
    the_parser = argparse.ArgumentParser(
        description="Cherry pick fasta sequences")
    the_parser.add_argument('--input', action="store", type=str,
                            help="input fasta file")
    # NOTE(review): the help text lists four modes, but the __main__ function
    # of this file only acts on 'with' and 'without' - confirm the others.
    the_parser.add_argument('--searchfor', action="store", type=str,
                            help="with, without, or withlist, withoutlist")
    # Help texts are built by implicit string concatenation: a backslash
    # continuation inside the literal would embed the source indentation.
    the_parser.add_argument('--query-string', dest="query_string",
                            action="store", type=str,
                            help="headers containing the string will be "
                                 "extracted or excluded as well as the "
                                 "corresponding sequence")
    the_parser.add_argument('--query-file', dest="query_file",
                            action="store", type=str,
                            help="headers containing any of the strings "
                                 "provided in the text file (1 string per "
                                 "line) will be extracted or excluded as "
                                 "well as the corresponding sequence")
    the_parser.add_argument(
        '--output', action="store", type=str, help="output fasta file")
    return the_parser.parse_args(argv)


def parse_fasta_with(query, FastaListe):
    """Return the fasta records whose header contains a query string.

    Args:
        query: a single search string, or a list of search strings.
        FastaListe: list of fasta records, each one being the text that
            followed a ">" in the fasta file (header line, then the
            sequence lines).

    Returns:
        The list of matching records, in the order of FastaListe. A record
        is reported once even when several query strings match its header.
    """
    if not isinstance(query, list):
        query = [query]
    accumulator = []
    for sequence in FastaListe:
        # Only the header (first line of the record) is searched, so a query
        # cannot match by accident inside the residues.
        header = sequence.partition("\n")[0]
        # any() stops at the first hit, which prevents appending the same
        # record once per matching query string.
        if any(string in header for string in query):
            accumulator.append(sequence)
    return accumulator


def complement_fasta(fullfasta, subfasta):
    """Return the records of fullfasta that are absent from subfasta.

    The result is de-duplicated and sorted, so the original record order
    is not kept.
    """
    remaining = set(fullfasta)
    remaining.difference_update(subfasta)
    return sorted(remaining)


def getquerylist(file):
    """Read the query strings listed one per line in a text file.

    Args:
        file: path of the text file to read.

    Returns:
        The list of lines, in file order, with trailing whitespace removed.
        Lines that are empty after stripping are skipped.
    """
    querylist = []
    # The context manager closes the handle even if reading fails.
    with open(file, 'r') as handle:
        for line in handle:
            entry = line.rstrip()
            # An empty string is a substring of every record, so a blank
            # line would turn the whole list into a match-everything query.
            if entry:
                querylist.append(entry)
    return querylist


def __main__():
    """Select fasta records by query and write them to the output file.

    Reads the fasta file given by --input, then, for --query-string and/or
    --query-file, writes to --output either the records matched by the query
    (--searchfor with) or the records left once the matches are removed
    (--searchfor without). Any other --searchfor value writes nothing.
    When both query options are given, the result for --query-string is
    written first, followed by the result for --query-file.
    """
    args = Parser()
    # Context managers guarantee that both handles are closed, even when a
    # later step raises.
    with open(args.input, "r") as fasta_handle:
        # Each record is the text that followed a ">"; whatever precedes the
        # first ">" is discarded.
        FastaListe = fasta_handle.read().split(">")[1:]
    queries = []
    if args.query_string:
        queries.append(args.query_string)
    if args.query_file:
        queries.append(getquerylist(args.query_file))
    with open(args.output, "w") as Output:
        for query in queries:
            if args.searchfor == 'with':
                records = parse_fasta_with(query, FastaListe)
            elif args.searchfor == 'without':
                records = complement_fasta(
                    FastaListe, parse_fasta_with(query, FastaListe))
            else:
                continue
            # Writing record by record means an empty selection produces no
            # output at all, instead of a lone ">".
            for record in records:
                Output.write(">" + record)
                # A record taken from the end of a file without a final
                # newline would otherwise be glued to the next ">".
                if not record.endswith("\n"):
                    Output.write("\n")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    __main__()