annotate profrep_masking.py @ 5:ad3bbf392135 draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 11:14:05 -0400
parents a5f1638b73be
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
2
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
3 import argparse
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
4 from Bio import SeqIO
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
5 from Bio.Seq import MutableSeq
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
6 from Bio.Alphabet import generic_dna
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
7 import sys
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
8
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
9
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def main(args):
    '''
    Mask the repeats of the query sequence(s) as requested on the command line.

    Reads the ``query``, ``mode``, ``repeat_gff`` and ``output_masked``
    attributes of ``args``, loads the repeat coordinates from the GFF file and
    hands them to the masking routine selected by the mode.
    '''
    # Pull every option up front, in the same order as they are declared
    query, mode, gff_path, masked_path = (
        args.query,
        args.mode,
        args.repeat_gff,
        args.output_masked,
    )

    repeat_intervals = get_indices(gff_path)

    # Anything other than "lowercase" falls through to N-masking
    mask = lower_mask if mode == "lowercase" else N_mask
    mask(query, repeat_intervals, masked_path)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
23
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
24
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def get_indices(REPEAT_GFF):
    '''
    Get indices of repeats from GFF file to mask

    Every line that is neither a comment (leading "#") nor blank is split on
    tabs: the first column is taken as the sequence id and the fourth and
    fifth columns as the start and end of the repeat, converted to int.

    Args:
        REPEAT_GFF: path to the tab-separated GFF file.

    Returns:
        dict mapping each sequence id to a list of ``[start, end]`` pairs,
        in the order they appear in the file.

    Raises:
        IndexError: a record line has fewer than five tab-separated columns.
        ValueError: a start or end column is not an integer.
    '''
    repeats_all = {}
    with open(REPEAT_GFF, "r") as repeats_gff:
        for line in repeats_gff:
            # Blank lines carry no record; without this guard they would
            # fail with IndexError when the coordinate columns are read.
            if line.startswith("#") or not line.strip():
                continue
            # Split once and reuse the columns
            fields = line.split("\t")
            seq_id = fields[0]
            start_r, end_r = int(fields[3]), int(fields[4])
            repeats_all.setdefault(seq_id, []).append([start_r, end_r])
    return repeats_all
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
41
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
42
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def lower_mask(QUERY, repeats_all, MASKED):
    '''
    Soft-mask repeats: write the query sequences with repeat regions lowercased

    Args:
        QUERY: path to the input FASTA file.
        repeats_all: dict mapping sequence id to a list of ``[start, end]``
            pairs (1-based, end inclusive), as returned by get_indices().
        MASKED: path of the FASTA file to write.

    Sequences that have no entry in ``repeats_all`` are written unchanged.
    Intervals reaching outside the sequence are clipped to its bounds.
    '''
    allSeqs = list(SeqIO.parse(QUERY, 'fasta'))
    for singleSeq in allSeqs:
        residues = list(str(singleSeq.seq))
        # A sequence without any repeat in the GFF has no key in the dict;
        # plain indexing would raise KeyError for it, so default to "no repeats".
        for start, end in repeats_all.get(singleSeq.id, []):
            # Convert 1-based inclusive coordinates to a 0-based half-open
            # slice. Clamping at 0 keeps a start of 0 from turning into index
            # -1 (the last base); the slice itself clips an end past the
            # sequence length.
            begin = max(start - 1, 0)
            stop = max(end, begin)
            residues[begin:stop] = [base.lower() for base in residues[begin:stop]]
        singleSeq.seq = MutableSeq("".join(residues), generic_dna)
    with open(MASKED, "w") as handle:
        SeqIO.write(allSeqs, handle, 'fasta')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
53
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
54
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
def N_mask(QUERY, repeats_all, MASKED):
    '''
    Hard-mask repeats: write the query sequences with repeat regions replaced by N

    Args:
        QUERY: path to the input FASTA file.
        repeats_all: dict mapping sequence id to a list of ``[start, end]``
            pairs (1-based, end inclusive), as returned by get_indices().
        MASKED: path of the FASTA file to write.

    Sequences that have no entry in ``repeats_all`` are written unchanged.
    Intervals reaching outside the sequence are clipped to its bounds.
    '''
    allSeqs = list(SeqIO.parse(QUERY, 'fasta'))
    for singleSeq in allSeqs:
        residues = list(str(singleSeq.seq))
        # A sequence without any repeat in the GFF has no key in the dict;
        # plain indexing would raise KeyError for it, so default to "no repeats".
        for start, end in repeats_all.get(singleSeq.id, []):
            # Convert 1-based inclusive coordinates to a 0-based half-open
            # slice. Clamping at 0 keeps a start of 0 from turning into index
            # -1 (the last base); the slice itself clips an end past the
            # sequence length.
            begin = max(start - 1, 0)
            stop = max(end, begin)
            # Replace exactly as many positions as the clipped slice covers,
            # so the sequence length never changes.
            residues[begin:stop] = ["N"] * len(residues[begin:stop])
        singleSeq.seq = MutableSeq("".join(residues), generic_dna)
    with open(MASKED, "w") as handle:
        SeqIO.write(allSeqs, handle, 'fasta')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
65
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
66
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
if __name__ == "__main__":

    # Command line arguments
    # Each option gets its own help text; previously all four shared the
    # description of --query, which made --help misleading.
    parser = argparse.ArgumentParser()
    parser.add_argument('-q',
                        '--query',
                        type=str,
                        required=True,
                        help='query sequence to be processed')
    parser.add_argument('-rg',
                        '--repeat_gff',
                        type=str,
                        required=True,
                        help='GFF file with the coordinates of repeats to mask')
    parser.add_argument('-m',
                        '--mode',
                        default="lowercase",
                        choices=['lowercase', 'N'],
                        help='masking mode: "lowercase" converts repeats to '
                             'lowercase, "N" replaces them with N '
                             '(default: %(default)s)')
    parser.add_argument('-o',
                        '--output_masked',
                        type=str,
                        default="output_masked",
                        help='output file for the masked sequence(s) '
                             '(default: %(default)s)')

    args = parser.parse_args()
    main(args)