Mercurial > repos > bgruening > bionano_scaffold
comparison remove_fake_cut_sites.py @ 4:8cc3862f8b8e draft
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
author | bgruening |
---|---|
date | Tue, 25 May 2021 20:12:52 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:295c0e28f4ee | 4:8cc3862f8b8e |
---|---|
1 import re | |
2 import sys | |
3 | |
4 from Bio import SeqIO | |
5 from Bio.Seq import Seq | |
6 | |
7 | |
8 def main(): | |
9 | |
10 fasta_file = sys.argv[1] | |
11 output_file = sys.argv[2] | |
12 log_file = sys.argv[3] | |
13 | |
14 output_handle = open(output_file, "w") | |
15 log_handle = open(log_file, "w") | |
16 | |
17 with open(fasta_file, "r") as fasta_input_handle: | |
18 for record in SeqIO.parse(fasta_input_handle, "fasta"): | |
19 | |
20 change_count = 0 | |
21 cut_sites = [ | |
22 Seq("CTTAAG"), | |
23 Seq("CTTCTCG"), | |
24 Seq("GCTCTTC"), | |
25 Seq("CCTCAGC"), | |
26 Seq("GAATGC"), | |
27 Seq("GCAATG"), | |
28 Seq("ATCGAT"), | |
29 Seq("CACGAG"), | |
30 ] | |
31 | |
32 for cut_site in cut_sites: | |
33 cut_site_both_orientations = (cut_site, cut_site.reverse_complement()) | |
34 | |
35 for cut_site_for_orientation in cut_site_both_orientations: | |
36 | |
37 n_flank_length = 1 | |
38 search_pattern = ( | |
39 "N" * n_flank_length | |
40 + str(cut_site_for_orientation) | |
41 + "N" * n_flank_length | |
42 ) | |
43 replacement = "N" * ( | |
44 n_flank_length * 2 + len(cut_site_for_orientation) | |
45 ) | |
46 | |
47 (new_string, changes) = re.subn( | |
48 search_pattern, | |
49 replacement, | |
50 str(record.seq.upper()), | |
51 flags=re.IGNORECASE, | |
52 ) | |
53 change_count += changes | |
54 | |
55 record.seq = Seq(new_string) | |
56 | |
57 if change_count > 0: | |
58 log_handle.write( | |
59 " ".join([record.id, ":", str(change_count), "changes\n"]) | |
60 ) | |
61 SeqIO.write([record], output_handle, "fasta") | |
62 | |
63 # Finally, count the matches | |
64 possible_fake_cut_sites = re.findall( | |
65 "N[^N]{1,10}N", str(record.seq.upper()) | |
66 ) | |
67 if len(possible_fake_cut_sites) > 0: | |
68 log_handle.write( | |
69 " ".join( | |
70 [ | |
71 record.id, | |
72 ":", | |
73 str(len(possible_fake_cut_sites)), | |
74 "possible non-standard fake cut sites\n", | |
75 ] | |
76 ) | |
77 ) | |
78 | |
79 output_handle.close() | |
80 log_handle.close() | |
81 | |
82 | |
83 if __name__ == "__main__": | |
84 main() |