Mercurial > repos > bgruening > bionano_scaffold
annotate remove_fake_cut_sites.py @ 4:8cc3862f8b8e draft
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
author | bgruening |
---|---|
date | Tue, 25 May 2021 20:12:52 +0000 |
parents | |
children |
rev | line source |
---|---|
4
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
1 import re |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
2 import sys |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
3 |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
4 from Bio import SeqIO |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
5 from Bio.Seq import Seq |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
6 |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
7 |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
# Cut-site motifs to mask. Each one is searched in both orientations
# (as written and reverse-complemented).
_CUT_SITES = (
    "CTTAAG",
    "CTTCTCG",
    "GCTCTTC",
    "CCTCAGC",
    "GAATGC",
    "GCAATG",
    "ATCGAT",
    "CACGAG",
)

# Number of N bases that must sit directly on each side of a motif for it
# to be treated as a fake cut site.
_N_FLANK_LENGTH = 1


def _compile_cut_site_patterns(cut_sites=_CUT_SITES, n_flank_length=_N_FLANK_LENGTH):
    """Build the regexes that find N-flanked cut sites.

    Args:
        cut_sites: Iterable of motif strings.
        n_flank_length: Number of N bases required on each side of a motif.

    Returns:
        A list of ``(compiled_pattern, replacement)`` pairs, one per distinct
        motif orientation, in the order the motifs were given. The
        replacement is a run of N as long as the motif.
    """
    motifs = []
    for cut_site in cut_sites:
        forward = Seq(cut_site)
        for oriented in (forward, forward.reverse_complement()):
            motif = str(oriented).upper()
            # Palindromic motifs equal their reverse complement; searching
            # them twice can never find anything new.
            if motif not in motifs:
                motifs.append(motif)

    flank = "N" * n_flank_length
    # The flanks are matched with lookbehind/lookahead so they are not
    # consumed: two sites that share a single flanking N are then both
    # found in one pass. Since the flanks are already N, replacing only the
    # motif yields the same text as replacing flank + motif + flank.
    return [
        (
            re.compile("(?<=" + flank + ")" + re.escape(motif) + "(?=" + flank + ")"),
            "N" * len(motif),
        )
        for motif in motifs
    ]


def _mask_fake_cut_sites(sequence, patterns):
    """Replace every N-flanked cut site in ``sequence`` with N.

    Args:
        sequence: Upper-case sequence string.
        patterns: ``(compiled_pattern, replacement)`` pairs as returned by
            ``_compile_cut_site_patterns``.

    Returns:
        A ``(masked_sequence, change_count)`` tuple, where ``change_count``
        is the total number of substitutions over all patterns.
    """
    change_count = 0
    for pattern, replacement in patterns:
        # Each pattern works on the output of the previous one so that all
        # substitutions accumulate in the final sequence.
        sequence, changes = pattern.subn(replacement, sequence)
        change_count += changes
    return sequence, change_count


def main():
    """Mask N-flanked cut sites in a FASTA file.

    Command-line arguments (read from ``sys.argv``):
        1. input FASTA path
        2. output FASTA path
        3. log file path

    Every record is upper-cased, has its N-flanked cut sites replaced by N,
    and is written to the output FASTA. The log receives one line per record
    with at least one replacement, and one line per record that still
    contains short (1-10 base) non-N stretches enclosed by N.

    Raises:
        IndexError: If fewer than three arguments are given.
        OSError: If any of the files cannot be opened.
    """
    fasta_file = sys.argv[1]
    output_file = sys.argv[2]
    log_file = sys.argv[3]

    # Built once: the patterns do not depend on the record being processed.
    patterns = _compile_cut_site_patterns()

    # All three handles are context-managed so they are closed even when
    # parsing or writing fails part-way through.
    with open(output_file, "w") as output_handle, open(
        log_file, "w"
    ) as log_handle, open(fasta_file, "r") as fasta_input_handle:
        for record in SeqIO.parse(fasta_input_handle, "fasta"):
            masked, change_count = _mask_fake_cut_sites(
                str(record.seq).upper(), patterns
            )
            # The record is always rewritten, so the output is upper-case
            # even when nothing was masked.
            record.seq = Seq(masked)

            if change_count > 0:
                log_handle.write(
                    "{} : {} changes\n".format(record.id, change_count)
                )
            SeqIO.write([record], output_handle, "fasta")

            # Finally, count the short N-enclosed stretches that remain.
            possible_fake_cut_sites = re.findall("N[^N]{1,10}N", masked)
            if possible_fake_cut_sites:
                log_handle.write(
                    "{} : {} possible non-standard fake cut sites\n".format(
                        record.id, len(possible_fake_cut_sites)
                    )
                )
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
81 |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
82 |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
83 if __name__ == "__main__": |
8cc3862f8b8e
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff
changeset
|
84 main() |