Mercurial > repos > jjohnson > find_in_reference
view find_in_reference.py @ 1:e83e0ce8fb68
Add option to reverse the search, find reference field in input field
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Wed, 13 Aug 2014 15:01:33 -0500 |
parents | e7e56b51d156 |
children | c4fd2ea4f988 |
line wrap: on
line source
#!/usr/bin/env python """ # #------------------------------------------------------------------------------ # University of Minnesota # Copyright 2013, Regents of the University of Minnesota #------------------------------------------------------------------------------ # Author: # # James E Johnson # #------------------------------------------------------------------------------ """ """ Takes 2 tabular files as input: 1. The file to be filtered 2. The reference file The string value of selected column of the input file is searched for in the string values of the selected column of the reference file. The intended purpose is to filter a peptide fasta file in tabular format by whether those peptide sequences are found in a reference fasta file. """ import sys,re,os.path import tempfile import optparse from optparse import OptionParser import logging def __main__(): #Parse Command Line parser = optparse.OptionParser() parser.add_option( '-i', '--input', dest='input', help='The input file to filter. (Otherwise read from stdin)' ) parser.add_option( '-r', '--reference', dest='reference', help='The reference file to filter against' ) parser.add_option( '-o', '--output', dest='output', help='The output file for input lines filtered by reference') parser.add_option( '-f', '--filtered', dest='filtered', help='The output file for input lines not in the output') parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)') parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)') parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' ) parser.add_option( '-R', '--reverse_find', dest='reverse_find', action="store_true", default=False, help='find the reference string in the input string' ) parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' ) parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' ) parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' ) parser.add_option( '-S', '--annotation_col_sep', dest='annotation_col_sep', default=',', help='separator character between annotation column from the same line' ) parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) (options, args) = parser.parse_args() # Input files if options.input != None: try: inputPath = os.path.abspath(options.input) inputFile = open(inputPath, 'r') except Exception, e: print >> sys.stderr, "failed: %s" % e exit(2) else: inputFile = sys.stdin # Reference if options.reference == None: print >> sys.stderr, "failed: reference file is required" exit(2) # Output files outFile = None filteredFile = None if options.filtered == None and options.output == None: #write to stdout outFile = sys.stdout else: if options.output != None: try: outPath = os.path.abspath(options.output) outFile = open(outPath, 'w') except Exception, e: print >> sys.stderr, "failed: %s" % e exit(3) if options.filtered != None: try: filteredPath = os.path.abspath(options.filtered) filteredFile = open(filteredPath, 'w') except Exception, e: print >> sys.stderr, "failed: %s" % e exit(3) incol = -1 if options.input_column and options.input_column > 0: incol = int(options.input_column)-1 refcol = -1 if options.reference_column and options.reference_column > 0: refcol = int(options.reference_column)-1 if options.annotation_columns: annotate = True annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(',')] else: annotate = False refFile = None num_found = 0 num_novel = 0 for ln,line in enumerate(inputFile): annotations = [] try: found = False search_string = line.split('\t')[incol].rstrip('\r\n') if options.ignore_case: search_string = search_string.upper() if options.debug: print >> sys.stderr, "search: %s" % (search_string) refFile = open(options.reference,'r') for tn,fline in enumerate(refFile): fields = fline.split('\t') target_string = fields[refcol].rstrip('\r\n') if options.ignore_case: target_string = target_string.upper() search = search_string if not options.reverse_find else target_string target = target_string if not options.reverse_find else search_string if options.debug: print >> sys.stderr, "in: %s %s %s" % (search,search in target,target) if search in target: found = True if annotate: annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns]) annotations.append(annotation) else: break if found: num_found += 1 if annotate: line = '%s\t%s\n' % (line.rstrip('\r\n'),options.annotation_separator.join(annotations)) if options.keep == True: if outFile: outFile.write(line) else: if filteredFile: filteredFile.write(line) else: num_novel += 1 if options.keep == True: if filteredFile: filteredFile.write(line) else: if outFile: outFile.write(line) except Exception, e: print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference,e) finally: if refFile: refFile.close() print >> sys.stdout, "found: %d novel: %d" % (num_found,num_novel) if __name__ == "__main__" : __main__()