Mercurial > repos > mbernt > fasta_regex_finder
comparison fastaregexfinder.py @ 1:9a811adb714f draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/bioinformatics-cafe commit c318cd3f894f89c13d7eb257678673da70f72212
author | iuc |
---|---|
date | Wed, 25 Jan 2023 14:03:52 +0000 |
parents | 269c627ae9f4 |
children |
comparison
equal
deleted
inserted
replaced
0:269c627ae9f4 | 1:9a811adb714f |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | |
4 import operator | |
3 import re | 5 import re |
4 import sys | 6 import sys |
5 import string | 7 |
6 import argparse | 8 VERSION = "0.1.1" |
7 import operator | 9 |
8 | 10 parser = argparse.ArgumentParser( |
9 VERSION='0.1.1' | 11 description=""" |
10 | |
11 parser = argparse.ArgumentParser(description=""" | |
12 | 12 |
13 DESCRIPTION | 13 DESCRIPTION |
14 | 14 |
15 Search a fasta file for matches to a regex and return a bed file with the | 15 Search a fasta file for matches to a regex and return a bed file with the |
16 coordinates of the match and the matched sequence itself. | 16 coordinates of the match and the matched sequence itself. |
17 | 17 |
18 Output bed file has columns: | 18 Output bed file has columns: |
19 1. Name of fasta sequence (e.g. chromosome) | 19 1. Name of fasta sequence (e.g. chromosome) |
20 2. Start of the match | 20 2. Start of the match |
21 3. End of the match | 21 3. End of the match |
22 4. ID of the match | 22 4. ID of the match |
23 5. Length of the match | 23 5. Length of the match |
24 6. Strand | 24 6. Strand |
25 7. Matched sequence as it appears on the forward strand | 25 7. Matched sequence as it appears on the forward strand |
26 | 26 |
27 For matches on the reverse strand it is reported the start and end position on the | 27 For matches on the reverse strand it is reported the start and end position on the |
28 forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT' | 28 forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT' |
29 present on the reverse strand is reported as ACCCTCCC). | 29 present on the reverse strand is reported as ACCCTCCC). |
30 | 30 |
31 Note: Fasta sequences (chroms) are read in memory one at a time along with the | 31 Note: Fasta sequences (chroms) are read in memory one at a time along with the |
32 matches for that chromosome. | 32 matches for that chromosome. |
33 The order of the output is: chroms as they are found in the inut fasta, matches | 33 The order of the output is: chroms as they are found in the inut fasta, matches |
34 sorted within chroms by positions. | 34 sorted within chroms by positions. |
35 | 35 |
36 EXAMPLE: | 36 EXAMPLE: |
37 ## Test data: | 37 ## Test data: |
38 echo '>mychr' > /tmp/mychr.fa | 38 echo '>mychr' > /tmp/mychr.fa |
39 echo 'ACTGnACTGnACTGnTGAC' >> /tmp/mychr.fa | 39 echo 'ACTGnACTGnACTGnTGAC' >> /tmp/mychr.fa |
40 | 40 |
41 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' | 41 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' |
42 mychr 0 4 mychr_0_4_for 4 + ACTG | 42 mychr 0 4 mychr_0_4_for 4 + ACTG |
43 mychr 5 9 mychr_5_9_for 4 + ACTG | 43 mychr 5 9 mychr_5_9_for 4 + ACTG |
44 mychr 10 14 mychr_10_14_for 4 + ACTG | 44 mychr 10 14 mychr_10_14_for 4 + ACTG |
45 | 45 |
46 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' --maxstr 3 | 46 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' --maxstr 3 |
47 mychr 0 4 mychr_0_4_for 4 + ACT[3,4] | 47 mychr 0 4 mychr_0_4_for 4 + ACT[3,4] |
48 mychr 5 9 mychr_5_9_for 4 + ACT[3,4] | 48 mychr 5 9 mychr_5_9_for 4 + ACT[3,4] |
49 mychr 10 14 mychr_10_14_for 4 + ACT[3,4] | 49 mychr 10 14 mychr_10_14_for 4 + ACT[3,4] |
50 | 50 |
51 less /tmp/mychr.fa | fastaRegexFinder.py -f - -r 'A\w\wGn' | 51 less /tmp/mychr.fa | fastaRegexFinder.py -f - -r 'A\\w\\wGn' |
52 mychr 0 5 mychr_0_5_for 5 + ACTGn | 52 mychr 0 5 mychr_0_5_for 5 + ACTGn |
53 mychr 5 10 mychr_5_10_for 5 + ACTGn | 53 mychr 5 10 mychr_5_10_for 5 + ACTGn |
54 mychr 10 15 mychr_10_15_for 5 + ACTGn | 54 mychr 10 15 mychr_10_15_for 5 + ACTGn |
55 | 55 |
56 DOWNLOAD | 56 DOWNLOAD |
57 fastaRegexFinder.py is hosted at https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder | 57 fastaRegexFinder.py is hosted at https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder |
58 | 58 |
59 """, formatter_class= argparse.RawTextHelpFormatter) | 59 """, |
60 | 60 formatter_class=argparse.RawTextHelpFormatter, |
61 parser.add_argument('--fasta', '-f', | 61 ) |
62 type= str, | 62 |
63 help='''Input fasta file to search. Use '-' to read the file from stdin. | 63 parser.add_argument( |
64 | 64 "--fasta", |
65 ''', | 65 "-f", |
66 required= True) | 66 type=str, |
67 | 67 help="Input fasta file to search. Use '-' to read the file from stdin.", |
68 parser.add_argument('--regex', '-r', | 68 required=True, |
69 type= str, | 69 ) |
70 help='''Regex to be searched in the fasta input. | 70 |
71 parser.add_argument( | |
72 "--regex", | |
73 "-r", | |
74 type=str, | |
75 help="""Regex to be searched in the fasta input. | |
71 Matches to the reverse complement will have - strand. | 76 Matches to the reverse complement will have - strand. |
72 The default regex is '([gG]{3,}\w{1,7}){3,}[gG]{3,}' which searches | 77 The default regex is '([gG]{3,}\\w{1,7}){3,}[gG]{3,}' which searches |
73 for G-quadruplexes. | 78 for G-quadruplexes.""", |
74 ''', | 79 default="([gG]{3,}\\w{1,7}){3,}[gG]{3,}", |
75 default= '([gG]{3,}\w{1,7}){3,}[gG]{3,}') | 80 ) |
76 | 81 |
77 parser.add_argument('--matchcase', '-m', | 82 parser.add_argument( |
78 action= 'store_true', | 83 "--matchcase", |
79 help='''Match case while searching for matches. Default is | 84 "-m", |
85 action="store_true", | |
86 help="""Match case while searching for matches. Default is | |
80 to ignore case (I.e. 'ACTG' will match 'actg'). | 87 to ignore case (I.e. 'ACTG' will match 'actg'). |
81 ''') | 88 """, |
82 | 89 ) |
83 parser.add_argument('--noreverse', | 90 |
84 action= 'store_true', | 91 parser.add_argument( |
85 help='''Do not search the reverse complement of the input fasta. | 92 "--noreverse", |
86 Use this flag to search protein sequences. | 93 action="store_true", |
87 ''') | 94 help="""Do not search the reverse complement of the input fasta. |
88 | 95 Use this flag to search protein sequences.""", |
89 parser.add_argument('--maxstr', | 96 ) |
90 type= int, | 97 |
91 required= False, | 98 parser.add_argument( |
92 default= 10000, | 99 "--maxstr", |
93 help='''Maximum length of the match to report in the 7th column of the output. | 100 type=int, |
101 required=False, | |
102 default=10000, | |
103 help="""Maximum length of the match to report in the 7th column of the output. | |
94 Default is to report up to 10000nt. | 104 Default is to report up to 10000nt. |
95 Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>] | 105 Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>] |
96 ''') | 106 """, |
97 | 107 ) |
98 parser.add_argument('--seqnames', '-s', | 108 |
99 type= str, | 109 parser.add_argument( |
100 nargs= '+', | 110 "--seqnames", |
101 default= [None], | 111 "-s", |
102 required= False, | 112 type=str, |
103 help='''List of fasta sequences in --fasta to | 113 nargs="+", |
114 default=[None], | |
115 required=False, | |
116 help="""List of fasta sequences in --fasta to | |
104 search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes. | 117 search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes. |
105 Default is to search all the sequences in input. | 118 Default is to search all the sequences in input. |
106 ''') | 119 """, |
107 parser.add_argument('--quiet', '-q', | 120 ) |
108 action= 'store_true', | 121 parser.add_argument( |
109 help='''Do not print progress report (i.e. sequence names as they are scanned). | 122 "--quiet", |
110 ''') | 123 "-q", |
111 | 124 action="store_true", |
112 | 125 help="""Do not print progress report (i.e. sequence names as they are scanned). |
113 | 126 """, |
114 parser.add_argument('--version', '-v', action='version', version='%(prog)s ' + VERSION) | 127 ) |
128 | |
129 | |
130 parser.add_argument("--version", "-v", action="version", version="%(prog)s " + VERSION) | |
115 | 131 |
116 | 132 |
117 args = parser.parse_args() | 133 args = parser.parse_args() |
118 | 134 |
119 " --------------------------[ Check and parse arguments ]---------------------- " | 135 " --------------------------[ Check and parse arguments ]---------------------- " |
120 | 136 |
121 if args.matchcase: | 137 if args.matchcase: |
122 flag= 0 | 138 flag = 0 |
123 else: | 139 else: |
124 flag= re.IGNORECASE | 140 flag = re.IGNORECASE |
125 | 141 |
126 " ------------------------------[ Functions ]--------------------------------- " | 142 " ------------------------------[ Functions ]--------------------------------- " |
127 | 143 |
144 | |
128 def sort_table(table, cols): | 145 def sort_table(table, cols): |
129 """ Code to sort list of lists | 146 """Code to sort list of lists |
130 see http://www.saltycrane.com/blog/2007/12/how-to-sort-table-by-columns-in-python/ | 147 see http://www.saltycrane.com/blog/2007/12/how-to-sort-table-by-columns-in-python/ |
131 | 148 |
132 sort a table by multiple columns | 149 sort a table by multiple columns |
133 table: a list of lists (or tuple of tuples) where each inner list | 150 table: a list of lists (or tuple of tuples) where each inner list |
134 represents a row | 151 represents a row |
135 cols: a list (or tuple) specifying the column numbers to sort by | 152 cols: a list (or tuple) specifying the column numbers to sort by |
136 e.g. (1,0) would sort by column 1, then by column 0 | 153 e.g. (1,0) would sort by column 1, then by column 0 |
137 """ | 154 """ |
138 for col in reversed(cols): | 155 for col in reversed(cols): |
139 table = sorted(table, key=operator.itemgetter(col)) | 156 table = sorted(table, key=operator.itemgetter(col)) |
140 return(table) | 157 return table |
158 | |
141 | 159 |
142 def trimMatch(x, n): | 160 def trimMatch(x, n): |
143 """ Trim the string x to be at most length n. Trimmed matches will be reported | 161 """Trim the string x to be at most length n. Trimmed matches will be reported |
144 with the syntax ACTG[a,b] where Ns are the beginning of x, a is the length of | 162 with the syntax ACTG[a,b] where Ns are the beginning of x, a is the length of |
145 the trimmed strng (e.g 4 here) and b is the full length of the match | 163 the trimmed strng (e.g 4 here) and b is the full length of the match |
146 EXAMPLE: | 164 EXAMPLE: |
147 trimMatch('ACTGNNNN', 4) | 165 trimMatch('ACTGNNNN', 4) |
148 >>>'ACTG[4,8]' | 166 >>>'ACTG[4,8]' |
149 trimMatch('ACTGNNNN', 8) | 167 trimMatch('ACTGNNNN', 8) |
150 >>>'ACTGNNNN' | 168 >>>'ACTGNNNN' |
151 """ | 169 """ |
152 if len(x) > n and n is not None: | 170 if len(x) > n and n is not None: |
153 m= x[0:n] + '[' + str(n) + ',' + str(len(x)) + ']' | 171 m = x[0:n] + "[" + str(n) + "," + str(len(x)) + "]" |
154 else: | 172 else: |
155 m= x | 173 m = x |
156 return(m) | 174 return m |
175 | |
157 | 176 |
158 def revcomp(x): | 177 def revcomp(x): |
159 """Reverse complement string x. Ambiguity codes are handled and case conserved. | 178 """Reverse complement string x. Ambiguity codes are handled and case conserved. |
160 | 179 |
161 Test | 180 Test |
162 x= 'ACGTRYSWKMBDHVNacgtryswkmbdhvn' | 181 x= 'ACGTRYSWKMBDHVNacgtryswkmbdhvn' |
163 revcomp(x) | 182 revcomp(x) |
164 """ | 183 """ |
165 compdict= {'A':'T', | 184 compdict = { |
166 'C':'G', | 185 "A": "T", |
167 'G':'C', | 186 "C": "G", |
168 'T':'A', | 187 "G": "C", |
169 'R':'Y', | 188 "T": "A", |
170 'Y':'R', | 189 "R": "Y", |
171 'S':'W', | 190 "Y": "R", |
172 'W':'S', | 191 "S": "W", |
173 'K':'M', | 192 "W": "S", |
174 'M':'K', | 193 "K": "M", |
175 'B':'V', | 194 "M": "K", |
176 'D':'H', | 195 "B": "V", |
177 'H':'D', | 196 "D": "H", |
178 'V':'B', | 197 "H": "D", |
179 'N':'N', | 198 "V": "B", |
180 'a':'t', | 199 "N": "N", |
181 'c':'g', | 200 "a": "t", |
182 'g':'c', | 201 "c": "g", |
183 't':'a', | 202 "g": "c", |
184 'r':'y', | 203 "t": "a", |
185 'y':'r', | 204 "r": "y", |
186 's':'w', | 205 "y": "r", |
187 'w':'s', | 206 "s": "w", |
188 'k':'m', | 207 "w": "s", |
189 'm':'k', | 208 "k": "m", |
190 'b':'v', | 209 "m": "k", |
191 'd':'h', | 210 "b": "v", |
192 'h':'d', | 211 "d": "h", |
193 'v':'b', | 212 "h": "d", |
194 'n':'n'} | 213 "v": "b", |
195 xrc= [] | 214 "n": "n", |
215 } | |
216 xrc = [] | |
196 for n in x: | 217 for n in x: |
197 xrc.append(compdict[n]) | 218 xrc.append(compdict[n]) |
198 xrc= ''.join(xrc)[::-1] | 219 xrc = "".join(xrc)[::-1] |
199 return(xrc) | 220 return xrc |
221 | |
222 | |
200 # ----------------------------------------------------------------------------- | 223 # ----------------------------------------------------------------------------- |
201 | 224 |
202 psq_re_f= re.compile(args.regex, flags= flag) | 225 psq_re_f = re.compile(args.regex, flags=flag) |
203 ## psq_re_r= re.compile(regexrev) | 226 # psq_re_r= re.compile(regexrev) |
204 | 227 |
205 if args.fasta != '-': | 228 if args.fasta != "-": |
206 ref_seq_fh= open(args.fasta) | 229 ref_seq_fh = open(args.fasta) |
207 else: | 230 else: |
208 ref_seq_fh= sys.stdin | 231 ref_seq_fh = sys.stdin |
209 | 232 |
210 ref_seq=[] | 233 ref_seq = [] |
211 line= (ref_seq_fh.readline()).strip() | 234 line = (ref_seq_fh.readline()).strip() |
212 chr= re.sub('^>', '', line) | 235 chr = re.sub("^>", "", line) |
213 line= (ref_seq_fh.readline()).strip() | 236 line = (ref_seq_fh.readline()).strip() |
214 gquad_list= [] | 237 gquad_list = [] |
215 while True: | 238 while True: |
216 if not args.quiet: | 239 if not args.quiet: |
217 sys.stderr.write('Processing %s\n' %(chr)) | 240 sys.stderr.write("Processing %s\n" % (chr)) |
218 while line.startswith('>') is False: | 241 while line.startswith(">") is False: |
219 ref_seq.append(line) | 242 ref_seq.append(line) |
220 line= (ref_seq_fh.readline()).strip() | 243 line = (ref_seq_fh.readline()).strip() |
221 if line == '': | 244 if line == "": |
222 break | 245 break |
223 ref_seq= ''.join(ref_seq) | 246 ref_seq = "".join(ref_seq) |
224 if args.seqnames == [None] or chr in args.seqnames: | 247 if args.seqnames == [None] or chr in args.seqnames: |
225 for m in re.finditer(psq_re_f, ref_seq): | 248 for m in re.finditer(psq_re_f, ref_seq): |
226 matchstr= trimMatch(m.group(0), args.maxstr) | 249 matchstr = trimMatch(m.group(0), args.maxstr) |
227 quad_id= str(chr) + '_' + str(m.start()) + '_' + str(m.end()) + '_for' | 250 quad_id = str(chr) + "_" + str(m.start()) + "_" + str(m.end()) + "_for" |
228 gquad_list.append([chr, m.start(), m.end(), quad_id, len(m.group(0)), '+', matchstr]) | 251 gquad_list.append( |
252 [chr, m.start(), m.end(), quad_id, len(m.group(0)), "+", matchstr] | |
253 ) | |
229 if args.noreverse is False: | 254 if args.noreverse is False: |
230 ref_seq= revcomp(ref_seq) | 255 ref_seq = revcomp(ref_seq) |
231 seqlen= len(ref_seq) | 256 seqlen = len(ref_seq) |
232 for m in re.finditer(psq_re_f, ref_seq): | 257 for m in re.finditer(psq_re_f, ref_seq): |
233 matchstr= trimMatch(revcomp(m.group(0)), args.maxstr) | 258 matchstr = trimMatch(revcomp(m.group(0)), args.maxstr) |
234 mstart= seqlen - m.end() | 259 mstart = seqlen - m.end() |
235 mend= seqlen - m.start() | 260 mend = seqlen - m.start() |
236 quad_id= str(chr) + '_' + str(mstart) + '_' + str(mend) + '_rev' | 261 quad_id = str(chr) + "_" + str(mstart) + "_" + str(mend) + "_rev" |
237 gquad_list.append([chr, mstart, mend, quad_id, len(m.group(0)), '-', matchstr]) | 262 gquad_list.append( |
238 gquad_sorted= sort_table(gquad_list, (1,2,3)) | 263 [chr, mstart, mend, quad_id, len(m.group(0)), "-", matchstr] |
239 gquad_list= [] | 264 ) |
265 gquad_sorted = sort_table(gquad_list, (1, 2, 3)) | |
266 gquad_list = [] | |
240 for xline in gquad_sorted: | 267 for xline in gquad_sorted: |
241 xline= '\t'.join([str(x) for x in xline]) | 268 xline = "\t".join([str(x) for x in xline]) |
242 print(xline) | 269 print(xline) |
243 chr= re.sub('^>', '', line) | 270 chr = re.sub("^>", "", line) |
244 ref_seq= [] | 271 ref_seq = [] |
245 line= (ref_seq_fh.readline()).strip() | 272 line = (ref_seq_fh.readline()).strip() |
246 if line == '': | 273 if line == "": |
247 break | 274 break |
248 | 275 |
249 #gquad_sorted= sort_table(gquad_list, (0,1,2,3)) | 276 # gquad_sorted= sort_table(gquad_list, (0,1,2,3)) |
250 # | 277 # |
251 #for line in gquad_sorted: | 278 # for line in gquad_sorted: |
252 # line= '\t'.join([str(x) for x in line]) | 279 # line= '\t'.join([str(x) for x in line]) |
253 # print(line) | 280 # print(line) |
254 sys.exit() | 281 sys.exit() |