annotate find_in_reference.py @ 2:c4fd2ea4f988

Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
author Jim Johnson <jj@umn.edu>
date Thu, 13 Nov 2014 14:09:50 -0600
parents e83e0ce8fb68
children 2429b413d90a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/env python
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
2 """
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
3 #
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
4 #------------------------------------------------------------------------------
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
5 # University of Minnesota
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
6 # Copyright 2013, Regents of the University of Minnesota
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
7 #------------------------------------------------------------------------------
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
8 # Author:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
9 #
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
10 # James E Johnson
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
11 #
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
12 #------------------------------------------------------------------------------
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
13 """
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
14
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
15 """
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
16 Takes 2 tabular files as input:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
17 1. The file to be filtered
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
18 2. The reference file
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
19
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
20 The string value of selected column of the input file is searched for
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
21 in the string values of the selected column of the reference file.
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
22
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
23 The intended purpose is to filter a peptide fasta file in tabular format
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
24 by whether those peptide sequences are found in a reference fasta file.
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
25
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
26 """
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
27 import sys,re,os.path
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
28 import tempfile
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
29 import optparse
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
30 from optparse import OptionParser
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
31 import logging
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
32
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
33
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
34 def __main__():
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
35 #Parse Command Line
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
36 parser = optparse.OptionParser()
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
37 parser.add_option( '-i', '--input', dest='input', help='The input file to filter. (Otherwise read from stdin)' )
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
38 parser.add_option( '-r', '--reference', dest='reference', help='The reference file to filter against' )
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
39 parser.add_option( '-o', '--output', dest='output', help='The output file for input lines filtered by reference')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
40 parser.add_option( '-f', '--filtered', dest='filtered', help='The output file for input lines not in the output')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
41 parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
42 parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
43 parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' )
1
e83e0ce8fb68 Add option to reverse the search, find reference field in input field
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
44 parser.add_option( '-R', '--reverse_find', dest='reverse_find', action="store_true", default=False, help='find the reference string in the input string' )
2
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
45 parser.add_option( '-B', '--test_reverse', dest='test_reverse', action="store_true", default=False, help='Also search for reversed input string in reference' )
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
46 parser.add_option( '-D', '--test_dna_reverse_complement', dest='test_reverse_comp', action="store_true", default=False, help='Also search for the DNA reverse complement of input string' )
0
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
47 parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' )
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
48 parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' )
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
49 parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' )
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
50 parser.add_option( '-S', '--annotation_col_sep', dest='annotation_col_sep', default=',', help='separator character between annotation column from the same line' )
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
51 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
52 (options, args) = parser.parse_args()
2
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
53
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
54 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1])
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
55 def test_rcomplement(seq, target):
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
56 if options.test_reverse_comp:
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
57 try:
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
58 comp = revcompl(seq)
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
59 return comp in target
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
60 except:
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
61 pass
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
62 return False
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
63
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
64 def test_reverse(seq,target):
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
65 return options.test_reverse and seq and seq[::-1] in target
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
66
0
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
67 # Input files
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
68 if options.input != None:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
69 try:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
70 inputPath = os.path.abspath(options.input)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
71 inputFile = open(inputPath, 'r')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
72 except Exception, e:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
73 print >> sys.stderr, "failed: %s" % e
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
74 exit(2)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
75 else:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
76 inputFile = sys.stdin
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
77 # Reference
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
78 if options.reference == None:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
79 print >> sys.stderr, "failed: reference file is required"
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
80 exit(2)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
81 # Output files
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
82 outFile = None
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
83 filteredFile = None
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
84 if options.filtered == None and options.output == None:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
85 #write to stdout
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
86 outFile = sys.stdout
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
87 else:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
88 if options.output != None:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
89 try:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
90 outPath = os.path.abspath(options.output)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
91 outFile = open(outPath, 'w')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
92 except Exception, e:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
93 print >> sys.stderr, "failed: %s" % e
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
94 exit(3)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
95 if options.filtered != None:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
96 try:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
97 filteredPath = os.path.abspath(options.filtered)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
98 filteredFile = open(filteredPath, 'w')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
99 except Exception, e:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
100 print >> sys.stderr, "failed: %s" % e
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
101 exit(3)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
102 incol = -1
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
103 if options.input_column and options.input_column > 0:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
104 incol = int(options.input_column)-1
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
105 refcol = -1
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
106 if options.reference_column and options.reference_column > 0:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
107 refcol = int(options.reference_column)-1
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
108 if options.annotation_columns:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
109 annotate = True
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
110 annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(',')]
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
111 else:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
112 annotate = False
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
113 refFile = None
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
114 num_found = 0
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
115 num_novel = 0
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
116 for ln,line in enumerate(inputFile):
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
117 annotations = []
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
118 try:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
119 found = False
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
120 search_string = line.split('\t')[incol].rstrip('\r\n')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
121 if options.ignore_case:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
122 search_string = search_string.upper()
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
123 if options.debug:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
124 print >> sys.stderr, "search: %s" % (search_string)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
125 refFile = open(options.reference,'r')
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
126 for tn,fline in enumerate(refFile):
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
127 fields = fline.split('\t')
1
e83e0ce8fb68 Add option to reverse the search, find reference field in input field
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
128 target_string = fields[refcol].rstrip('\r\n')
0
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
129 if options.ignore_case:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
130 target_string = target_string.upper()
1
e83e0ce8fb68 Add option to reverse the search, find reference field in input field
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
131 search = search_string if not options.reverse_find else target_string
e83e0ce8fb68 Add option to reverse the search, find reference field in input field
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
132 target = target_string if not options.reverse_find else search_string
0
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
133 if options.debug:
1
e83e0ce8fb68 Add option to reverse the search, find reference field in input field
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
134 print >> sys.stderr, "in: %s %s %s" % (search,search in target,target)
2
c4fd2ea4f988 Add the option to test the reversed sequence and the DNA reverse complement of the sequence (ignored if the sequence cannot be interpreted as DNA)
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
135 if search in target or test_reverse(search,target) or test_rcomplement(search,target):
0
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
136 found = True
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
137 if annotate:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
138 annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns])
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
139 annotations.append(annotation)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
140 else:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
141 break
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
142 if found:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
143 num_found += 1
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
144 if annotate:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
145 line = '%s\t%s\n' % (line.rstrip('\r\n'),options.annotation_separator.join(annotations))
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
146 if options.keep == True:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
147 if outFile:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
148 outFile.write(line)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
149 else:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
150 if filteredFile:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
151 filteredFile.write(line)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
152 else:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
153 num_novel += 1
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
154 if options.keep == True:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
155 if filteredFile:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
156 filteredFile.write(line)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
157 else:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
158 if outFile:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
159 outFile.write(line)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
160 except Exception, e:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
161 print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference,e)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
162 finally:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
163 if refFile:
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
164 refFile.close()
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
165 print >> sys.stdout, "found: %d novel: %d" % (num_found,num_novel)
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
166
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
167 if __name__ == "__main__" : __main__()
e7e56b51d156 Uploaded
jjohnson
parents:
diff changeset
168