Mercurial > repos > bgruening > find_subsequences
comparison find_subsequences.py @ 1:d882a0a75759 draft default tip
Uploaded
author | bgruening |
---|---|
date | Fri, 10 Apr 2015 06:49:30 -0400 |
parents | 7f39014f9404 |
children |
comparison
equal
deleted
inserted
replaced
0:7f39014f9404 | 1:d882a0a75759 |
---|---|
8 from Bio.SeqUtils import nt_search | 8 from Bio.SeqUtils import nt_search |
9 from Bio.Alphabet import generic_dna | 9 from Bio.Alphabet import generic_dna |
10 | 10 |
11 choices = ['embl', 'fasta', 'fastq-sanger', 'fastq', 'fastq-solexa', 'fastq-illumina', 'genbank', 'gb'] | 11 choices = ['embl', 'fasta', 'fastq-sanger', 'fastq', 'fastq-solexa', 'fastq-illumina', 'genbank', 'gb'] |
12 | 12 |
13 def find_pattern(seqs, pattern, outfile_path): | 13 def find_pattern(seqs, pattern, outfile_path, strand): |
14 """ | 14 """ |
15 Finds all occurrences of a pattern in the a given sequence. | 15 Finds all occurrences of a pattern in the a given sequence. |
16 Outputs sequence ID, start and end postion of the pattern. | 16 Outputs sequence ID, start and end postion of the pattern. |
17 """ | 17 """ |
18 pattern = pattern.upper() | 18 pattern = pattern.upper() |
21 if set(pattern).difference(set('ATCG')): | 21 if set(pattern).difference(set('ATCG')): |
22 search_func = complex_pattern_search | 22 search_func = complex_pattern_search |
23 | 23 |
24 with open(outfile_path, 'w+') as outfile: | 24 with open(outfile_path, 'w+') as outfile: |
25 for seq in seqs: | 25 for seq in seqs: |
26 search_func(seq, pattern, outfile) | 26 if strand in ['both', 'forward']: |
27 search_func(seq, rev_compl, outfile, '-') | 27 search_func(seq, pattern, outfile) |
28 if strand in ['both', 'reverse']: | |
29 search_func(seq, rev_compl, outfile, '-') | |
28 | 30 |
29 | 31 |
30 def simple_pattern_search(sequence, pattern, outfile, strand='+'): | 32 def simple_pattern_search(sequence, pattern, outfile, strand='+'): |
31 """ | 33 """ |
32 Simple regular expression search. This is way faster than the complex search. | 34 Simple regular expression search. This is way faster than the complex search. |
48 outfile.write(bed_template % (sequence.id, match, match+l, sequence.description, '', strand) ) | 50 outfile.write(bed_template % (sequence.id, match, match+l, sequence.description, '', strand) ) |
49 | 51 |
50 | 52 |
51 if __name__ == "__main__": | 53 if __name__ == "__main__": |
52 parser = argparse.ArgumentParser() | 54 parser = argparse.ArgumentParser() |
53 parser.add_argument('-i', '--input' , required=True) | 55 parser.add_argument('-i', '--input', required=True) |
54 parser.add_argument('-o', '--output' , required=True) | 56 parser.add_argument('-o', '--output', required=True) |
55 parser.add_argument('-p', '--pattern' , required=True) | 57 parser.add_argument('-p', '--pattern', required=True) |
58 parser.add_argument('--strand', choices=['both', 'forward', 'reverse'], default='both') | |
56 parser.add_argument('-f', '--format', default="fasta", choices=choices) | 59 parser.add_argument('-f', '--format', default="fasta", choices=choices) |
57 args = parser.parse_args() | 60 args = parser.parse_args() |
58 | 61 |
59 with open(args.input) as handle: | 62 with open(args.input) as handle: |
60 find_pattern( SeqIO.parse(handle, args.format), args.pattern, args.output ) | 63 find_pattern( SeqIO.parse(handle, args.format), args.pattern, args.output, args.strand ) |
61 | 64 |