annotate gff_selection.py @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
2
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
3 import argparse
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
4
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
5
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
6 def check_file_start(gff_file):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
7 count_comment = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
8 with open(gff_file, "r") as gff_all:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
9 line = gff_all.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
10 while line.startswith("#"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
11 line = gff_all.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
12 count_comment += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
13 return count_comment
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
14
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
15
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
16 def cut_region(GFF_IN, GFF_OUT, REGION, NEW_SEQ_ID):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
17 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
18 Extract records for particular sequence and/or region from arbitrary GFF3 file
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
19 in form: original_seq_name:start-end (e.g. chr1:1000-2000)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
20 Write a new GFF containing only records from this region
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
21 If new SEQ ID for extracted region is not provided, it will be named based on the region to cut
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
22 ! ALLOWS TO CUT ONLY ONE REGION AT A TIME
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
23 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
24 ## cut a particular part of a paritcular sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
25 if ":" and "-" in REGION:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
26 seq_to_cut = ":".join(REGION.split(":")[:-1])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
27 int_start = int(REGION.split(":")[-1].split("-")[0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
28 int_end = int(REGION.split(":")[-1].split("-")[1])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
29 ## cut the whole sequence if region is not specified
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
30 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
31 int_start = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
32 int_end = float("inf")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
33 seq_to_cut = REGION
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
34 count_comment = check_file_start(GFF_IN)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
35 with open(GFF_OUT, "w") as gff_out:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
36 with open(GFF_IN, "r") as gff_in:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
37 for comment_idx in range(count_comment):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
38 next(gff_in)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
39 gff_out.write("##gff-version 3\n")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
40 gff_out.write("##sequence region {}\n".format(REGION))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
41 for line in gff_in:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
42 if not line.startswith("#") and line.split("\t")[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
43 0] == seq_to_cut and int(float(line.split("\t")[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
44 3])) >= int_start and int(float(line.split("\t")[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
45 4])) <= int_end:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
46 new_start = int(line.split("\t")[3]) - int_start + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
47 new_end = int(line.split("\t")[4]) - int_start + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
48 gff_out.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
49 NEW_SEQ_ID, line.split("\t")[1], line.split("\t")[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
50 2], new_start, new_end, line.split("\t")[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
51 5], line.split("\t")[6], line.split("\t")[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
52 7], line.split("\t")[8]))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
53
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
54
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
55 def main(args):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
56 # Command line arguments
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
57 GFF_IN = args.gff_input
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
58 GFF_OUT = args.gff_output
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
59 REGION = args.region
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
60 NEW_SEQ_ID = args.new_seq_id
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
61
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
62 if GFF_OUT is None:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
63 GFF_OUT = "{}_cut{}.gff3".format(GFF_IN, REGION)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
64
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
65 if not NEW_SEQ_ID:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
66 NEW_SEQ_ID = REGION
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
67
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
68 cut_region(GFF_IN, GFF_OUT, REGION, NEW_SEQ_ID)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
69
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
70
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
71 if __name__ == "__main__":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
72
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
73 # Command line arguments
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
74 parser = argparse.ArgumentParser()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
75 parser.add_argument('-gi',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
76 '--gff_input',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
77 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
78 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
79 help='choose gff file')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
80 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
81 '-go', '--gff_output',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
82 type=str, help='choose gff file')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
83 parser.add_argument('-si', '--new_seq_id', type=str, help=' ')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
84 parser.add_argument('-rg', '--region', type=str, required=True, help=' ')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
85 args = parser.parse_args()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
86 main(args)