comparison find_subsequences.py @ 1:d882a0a75759 draft default tip

Uploaded
author bgruening
date Fri, 10 Apr 2015 06:49:30 -0400
parents 7f39014f9404
children
comparison
equal deleted inserted replaced
0:7f39014f9404 1:d882a0a75759
8 from Bio.SeqUtils import nt_search 8 from Bio.SeqUtils import nt_search
9 from Bio.Alphabet import generic_dna 9 from Bio.Alphabet import generic_dna
10 10
11 choices = ['embl', 'fasta', 'fastq-sanger', 'fastq', 'fastq-solexa', 'fastq-illumina', 'genbank', 'gb'] 11 choices = ['embl', 'fasta', 'fastq-sanger', 'fastq', 'fastq-solexa', 'fastq-illumina', 'genbank', 'gb']
12 12
13 def find_pattern(seqs, pattern, outfile_path): 13 def find_pattern(seqs, pattern, outfile_path, strand):
14 """ 14 """
15 Finds all occurrences of a pattern in the a given sequence. 15 Finds all occurrences of a pattern in the a given sequence.
16 Outputs sequence ID, start and end postion of the pattern. 16 Outputs sequence ID, start and end postion of the pattern.
17 """ 17 """
18 pattern = pattern.upper() 18 pattern = pattern.upper()
21 if set(pattern).difference(set('ATCG')): 21 if set(pattern).difference(set('ATCG')):
22 search_func = complex_pattern_search 22 search_func = complex_pattern_search
23 23
24 with open(outfile_path, 'w+') as outfile: 24 with open(outfile_path, 'w+') as outfile:
25 for seq in seqs: 25 for seq in seqs:
26 search_func(seq, pattern, outfile) 26 if strand in ['both', 'forward']:
27 search_func(seq, rev_compl, outfile, '-') 27 search_func(seq, pattern, outfile)
28 if strand in ['both', 'reverse']:
29 search_func(seq, rev_compl, outfile, '-')
28 30
29 31
30 def simple_pattern_search(sequence, pattern, outfile, strand='+'): 32 def simple_pattern_search(sequence, pattern, outfile, strand='+'):
31 """ 33 """
32 Simple regular expression search. This is way faster than the complex search. 34 Simple regular expression search. This is way faster than the complex search.
48 outfile.write(bed_template % (sequence.id, match, match+l, sequence.description, '', strand) ) 50 outfile.write(bed_template % (sequence.id, match, match+l, sequence.description, '', strand) )
49 51
50 52
51 if __name__ == "__main__": 53 if __name__ == "__main__":
52 parser = argparse.ArgumentParser() 54 parser = argparse.ArgumentParser()
53 parser.add_argument('-i', '--input' , required=True) 55 parser.add_argument('-i', '--input', required=True)
54 parser.add_argument('-o', '--output' , required=True) 56 parser.add_argument('-o', '--output', required=True)
55 parser.add_argument('-p', '--pattern' , required=True) 57 parser.add_argument('-p', '--pattern', required=True)
58 parser.add_argument('--strand', choices=['both', 'forward', 'reverse'], default='both')
56 parser.add_argument('-f', '--format', default="fasta", choices=choices) 59 parser.add_argument('-f', '--format', default="fasta", choices=choices)
57 args = parser.parse_args() 60 args = parser.parse_args()
58 61
59 with open(args.input) as handle: 62 with open(args.input) as handle:
60 find_pattern( SeqIO.parse(handle, args.format), args.pattern, args.output ) 63 find_pattern( SeqIO.parse(handle, args.format), args.pattern, args.output, args.strand )
61 64