Mercurial > repos > earlhaminst > t_coffee
view filter_by_fasta_ids.py @ 4:fa59d6fea7f5 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author | earlhaminst |
---|---|
date | Fri, 03 Mar 2017 07:29:32 -0500 |
parents | 78dd29aa7fc1 |
children | 0a189243186d |
line wrap: on
line source
#!/usr/bin/env python
"""
A script to build specific fasta databases.

Usage: filter_by_fasta_ids.py <ids_file> <fasta_file>

Reads one identifier per line from <ids_file> and prints to stdout every
record of <fasta_file> whose header contains ">" + identifier (compared
case-insensitively). Each identifier is consumed by its first match.
"""
from __future__ import print_function

import collections
import sys

# One FASTA record: the header line (including the leading '>') and the
# sequence with all of its lines joined together.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """Yield a Sequence for every record in the FASTA file `fasta_filename`.

    The header is the '>' line with trailing whitespace removed; the sequence
    is every following line up to the next '>' line, each right-stripped and
    concatenated.

    Raises AssertionError if a record does not start with a '>' line (for
    example when the file begins with a blank or non-header line).
    """
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        while line:
            # Raised explicitly (instead of via `assert`) so the check is not
            # stripped under `python -O`; the exception type is kept as-is.
            if not line.startswith('>'):
                raise AssertionError("FASTA headers must start with >")
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, "".join(sequence_parts))


def target_match(target, search_entry):
    """Return the first item of `target` contained in `search_entry`, or None.

    `search_entry` is upper-cased before the comparison, so the items of
    `target` are expected to be upper case already. The test is a plain
    substring test: a target such as ">ID1" also matches ">ID10 ...".
    """
    search_entry = search_entry.upper()
    for atarget in target:
        if atarget in search_entry:
            return atarget
    return None


def main():
    """Print the FASTA records whose header matches a wanted identifier.

    sys.argv[1] is the identifier file (one ID per line) and sys.argv[2] is
    the FASTA file. For every matching record the header and the joined
    sequence are printed on two lines.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: %s <ids_file> <fasta_file>" % sys.argv[0])

    targets = []
    with open(sys.argv[1]) as f_target:
        for line in f_target:
            identifier = line.strip()
            # Skip blank lines: an empty ID would become the target ">",
            # which is a substring of every FASTA header.
            if identifier:
                targets.append(">%s" % identifier.upper())

    for entry in FASTAReader_gen(sys.argv[2]):
        matched = target_match(targets, entry.header)
        if matched is not None:
            # Each wanted ID is used up by the first record it matches.
            targets.remove(matched)
            print(entry.header)
            print(entry.sequence)


if __name__ == "__main__":
    main()