# HG changeset patch # User galaxyp # Date 1464109522 14400 # Node ID 8d15aebf55fd85d9f9600108d2b2f32b12127d32 # Parent 463ebeccb8547f457923e4db9448c8f445b278d4 planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30 diff -r 463ebeccb854 -r 8d15aebf55fd README.md --- a/README.md Fri Sep 26 14:23:16 2014 -0400 +++ b/README.md Tue May 24 13:05:22 2016 -0400 @@ -1,7 +1,7 @@ GalaxyP - Filter by FASTA IDs ============================= -* Home: +* Home: * Galaxy Tool Shed: * Tool ID: `filter_by_fasta_ids` @@ -15,9 +15,9 @@ GalaxyP Community ----------------- -Current governing community policies for [GalaxyP](https://bitbucket.org/galaxyp/) and other information can be found at: +Current governing community policies for [GalaxyP](https://github.com/galaxyproteomics/) and other information can be found at: - + License @@ -35,7 +35,7 @@ Contributing ------------ -Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in unless you opt-out. +Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in Authors diff -r 463ebeccb854 -r 8d15aebf55fd filter_by_fasta_ids.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_fasta_ids.py Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,100 @@ +#!/usr/bin/env python +""" A script to build specific fasta databases """ +from __future__ import print_function +import optparse + + +# ===================================== Iterator =============================== +class Sequence: + ''' Holds protein sequence information ''' + def __init__(self): + self.header = "" + self.sequence_parts = [] + + def get_sequence(self): + return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts]) + + +class FASTAReader: + """ + FASTA db iterator. Returns a single FASTA sequence object. + """ + def __init__(self, fasta_name): + self.fasta_file = open(fasta_name) + self.next_line = self.fasta_file.readline() + + def __iter__(self): + return self + + def __next__(self): + ''' Iteration ''' + next_line = self.next_line + if not next_line: + raise StopIteration + + seq = Sequence() + seq.header = next_line.rstrip().replace('\n', '').replace('\r', '') + + next_line = self.fasta_file.readline() + while next_line and next_line[0] != '>': + seq.sequence_parts.append(next_line) + next_line = self.fasta_file.readline() + self.next_line = next_line + return seq + + # Python 2/3 compat + next = __next__ + + +def target_match(target, search_entry): + ''' Matches ''' + search_entry = search_entry.upper() + for atarget in target: + if search_entry.find(atarget) > -1: + return atarget + return None + + +def main(): + ''' the main function''' + + parser = optparse.OptionParser() + parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') + (options, args) = parser.parse_args() + + targets = [] + + with open(args[0]) as f_target: + for line in f_target.readlines(): + targets.append(">%s" % line.strip().upper()) + + print('Read target file, now looking for %d sequences.' % len(targets)) + + work_summary = {'wanted': len(targets), 'found': 0} + if options.dedup: + used_sequences = set() + work_summary['duplicates'] = 0 + homd_db = FASTAReader(args[1]) + + with open(args[2], "w") as output: + for entry in homd_db: + target_matched_results = target_match(targets, entry.header) + if target_matched_results: + work_summary['found'] += 1 + targets.remove(target_matched_results) + sequence = entry.get_sequence() + if options.dedup: + if sequence in used_sequences: + work_summary['duplicates'] += 1 + continue + else: + used_sequences.add(sequence) + print(entry.header, file=output) + print(sequence, file=output) + + print('Completed filtering.') + for parm, count in work_summary.items(): + print('%s ==> %d' % (parm, count)) + +if __name__ == "__main__": + main() diff -r 463ebeccb854 -r 8d15aebf55fd filter_by_fasta_ids.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_fasta_ids.xml Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,40 @@ + + Extract sequences from a FASTA file based on a list of IDs + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 463ebeccb854 -r 8d15aebf55fd test-data/.gitkeep diff -r 463ebeccb854 -r 8d15aebf55fd test-data/ids.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ids.txt Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,5 @@ +2 +2_bis +3 +4 +6 diff -r 463ebeccb854 -r 8d15aebf55fd test-data/input.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.fasta Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,14 @@ +>1 +TGAC +>2 +AAAAAAAA +>3 +ACGT +>2_bis +AAAA +AAAA +>4 +ACGT +TGAC +>5 +TTTT diff -r 463ebeccb854 -r 8d15aebf55fd test-data/output_dedup.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_dedup.fasta Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,6 @@ +>2 +AAAAAAAA +>3 +ACGT +>4 +ACGTTGAC diff -r 463ebeccb854 -r 8d15aebf55fd test-data/output_not_dedup.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_not_dedup.fasta Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,8 @@ +>2 +AAAAAAAA +>3 +ACGT +>2_bis +AAAAAAAA +>4 +ACGTTGAC diff -r 463ebeccb854 -r 8d15aebf55fd tool-data/.gitkeep diff -r 463ebeccb854 -r 8d15aebf55fd tools/filter_by_fasta_ids.py --- a/tools/filter_by_fasta_ids.py Fri Sep 26 14:23:16 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -#!/usr/bin/env python -""" A script to build specific fasta databases """ -from __future__ import print_function -import sys -import logging - -#===================================== Iterator =============================== -class Sequence: - ''' Holds protein sequence information ''' - def __init__(self): - self.header = "" - self.sequence_parts = [] - - def get_sequence(self): - return "".join([line.rstrip().replace('\n','').replace('\r','') for line in self.sequence_parts]) - -class FASTAReader: - """ - FASTA db iterator. Returns a single FASTA sequence object. - """ - def __init__(self, fasta_name): - self.fasta_file = open(fasta_name) - self.next_line = self.fasta_file.readline() - - def __iter__(self): - return self - - def __next__(self): - ''' Iteration ''' - #while True: - # line = self.fasta_file.readline() - # if not line: - # raise StopIteration - # if line[0] == '>': - # break - next_line = self.next_line - if not next_line: - raise StopIteration - - seq = Sequence() - seq.header = next_line.rstrip().replace('\n','').replace('\r','') - - next_line = self.fasta_file.readline() - while next_line and next_line[0] != '>': - #tail = self.fasta_file.tell() - #line = self.fasta_file.readline() - #if not line: - # break - #if line[0] == '>': - # self.fasta_file.seek(tail) - # break - seq.sequence_parts.append(next_line) - next_line = self.fasta_file.readline() - self.next_line = next_line - return seq - - # Python 2/3 compat - next = __next__ -#============================================================================== - -def target_match(target, search_entry): - ''' Matches ''' - search_entry = search_entry.upper() - for atarget in target: - if search_entry.find(atarget) > -1: - return atarget - return None - - -def main(): - ''' the main function''' - logging.basicConfig(filename='filter_fasta_log', - level=logging.INFO, - format='%(asctime)s :: %(levelname)s :: %(message)s',) - - used_sequences = set() - work_summary = {'wanted': 0, 'found':0, 'duplicates':0} - targets = [] - - f_target = open(sys.argv[1]) - for line in f_target.readlines(): - targets.append(">%s" % line.strip().upper()) - f_target.close() - - logging.info('Read target file and am now looking for %d %s', len(targets), 'sequences.') - - work_summary['wanted'] = len(targets) - homd_db = FASTAReader(sys.argv[2]) - - i = 0 - output = open(sys.argv[3], "w") - try: - for entry in homd_db: - target_matched_results = target_match(targets, entry.header) - if target_matched_results: - work_summary['found'] += 1 - targets.remove(target_matched_results) - sequence = entry.get_sequence() - if sequence in used_sequences: - work_summary['duplicates'] += 1 - else: - used_sequences.add(sequence) - print(entry.header, file=output) - print(sequence, file=output) - finally: - output.close() - - logging.info('Completed filtering') - for parm, count in work_summary.items(): - logging.info('%s ==> %d', parm, count) - -if __name__ == "__main__": - main() diff -r 463ebeccb854 -r 8d15aebf55fd tools/filter_by_fasta_ids.xml --- a/tools/filter_by_fasta_ids.xml Fri Sep 26 14:23:16 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ - - Extract sequences from a FASTA file based on a list of IDs - filter_by_fasta_ids.py $identifiers $input $output - - - - - - - - - -