Mercurial > repos > earlhaminst > t_coffee
view filter_by_fasta_ids.py @ 8:ae69d14b6fbf draft default tip
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 28bbc172f28d9fbe7ed2795043ff61d9e0642d13"
author | earlhaminst |
---|---|
date | Thu, 14 Jan 2021 12:14:52 +0000 |
parents | 0a189243186d |
children |
line wrap: on
line source
#!/usr/bin/env python
"""
Extract selected records from a FASTA file.

Usage: filter_by_fasta_ids.py <id_list_file> <fasta_file>

The ID list file holds one identifier per line. Every FASTA record whose
header contains ">" + identifier (compared case-insensitively) is written to
standard output as two lines: the original header, then the sequence joined
onto a single line.
"""
from __future__ import print_function

import collections
import sys

# One FASTA record: the header line (including the leading '>') and the
# sequence with all of its lines concatenated.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """
    Lazily yield a Sequence for each record of a FASTA file.

    fasta_filename: path of the FASTA file to read.

    Trailing whitespace is stripped from the header and from every sequence
    line; the sequence lines of a record are joined without a separator.
    An empty file yields nothing.

    Raises AssertionError if the first line of the file does not start
    with '>'.
    """
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        while line:
            # Raised explicitly rather than through an `assert` statement so
            # that the check is still performed when Python runs with -O.
            if not line.startswith('>'):
                raise AssertionError("FASTA headers must start with >")
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            # Gather sequence lines up to the next header or the end of file.
            while line and not line.startswith('>'):
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, "".join(sequence_parts))


def target_match(target, search_entry):
    """
    Return the first element of `target` contained in `search_entry`.

    target: iterable of strings; main() passes upper-cased identifiers
        prefixed with '>'.
    search_entry: FASTA header; it is upper-cased before the comparison.

    This is a substring test, so the target ">AB" also matches the header
    ">abc". Returns None when nothing matches.
    """
    search_entry = search_entry.upper()
    for atarget in target:
        if atarget in search_entry:
            return atarget
    return None


def main():
    """
    Print the FASTA records selected by an ID list.

    sys.argv[1] is the ID list file and sys.argv[2] the FASTA file. Each
    entry of the ID list selects at most one record: once it has matched a
    header it is discarded.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: %s <id_list_file> <fasta_file>" % sys.argv[0])

    targets = []
    with open(sys.argv[1]) as f_target:
        for line in f_target:
            identifier = line.strip().upper()
            # A blank line would become the target ">", which is contained
            # in every header and would select an unrelated record.
            if identifier:
                targets.append(">%s" % identifier)

    for entry in FASTAReader_gen(sys.argv[2]):
        matched_target = target_match(targets, entry.header)
        if matched_target:
            targets.remove(matched_target)
            print(entry.header)
            print(entry.sequence)
            if not targets:
                # Every requested ID has been found; the remaining records
                # cannot match anything.
                break


if __name__ == "__main__":
    main()