annotate RaGOO/Assemblytics_within_alignment.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
1 #! /usr/bin/env python
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
2 import argparse
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
3
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
4 import gzip
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
5 # Author: Maria Nattestad
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
6 # Email: mnattest@cshl.edu
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
7 # This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
8
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
9
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
10 def run(args):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
11 filename = args.delta
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
12 minimum_variant_size = args.minimum_variant_size
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
14 f = open(filename)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
15 header1 = f.readline()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
16 if header1[0:2]=="\x1f\x8b":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
17 f.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
18 f = gzip.open(filename)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
19 header1 = f.readline()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
20
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
21 # Ignore the first two lines for now
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
22 f.readline()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
23
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
24 linecounter = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
25
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
26 current_reference_name = ""
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
27 current_reference_position = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
28
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
29 current_query_name = ""
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
30 current_query_position = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
31
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
32 variants = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
33
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
34 for line in f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
35 if line[0]==">":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
36 # linecounter += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
37 # if linecounter > 1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
38 # break
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
39 fields = line.strip().split()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
40 current_reference_name = fields[0][1:]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
41 current_query_name = fields[1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
42 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
43 fields = line.strip().split()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
44 if len(fields) > 4:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
45 # current_reference_position = int(fields[0])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
46 current_reference_position = min(int(fields[0]),int(fields[1]))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
47 # fields[1] is the reference position at the end of the alignment
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
48 # current_query_position = int(fields[2])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
49 current_query_position = min(int(fields[2]),int(fields[3]))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
50 # fields[3] is the query position at the end of the alignment
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
51 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
52 tick = int(fields[0])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
53 if abs(tick) == 1: # then go back and edit the last entry to add 1 more to its size
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
54 report = variants[-1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
55 report[4] = report[4] + 1 # size
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
56 if tick > 0: # deletion, moves in reference
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
57 report[2] = report[2] + 1 # reference end position
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
58 report[7] = report[7] + 1 # reference gap size
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
59 current_reference_position += 1 # update reference position after deletion
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
60 elif tick < 0: # insertion, moves in query
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
61 report[8] = report[8] + 1 # query gap size
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
62 report[12] = report[12] + 1 # query end position
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
63 current_query_position += 1 # update query position after insertion
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
64 else: # report the last one and continue
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
65 current_reference_position += abs(tick) - 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
66 current_query_position += abs(tick) - 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
67 if tick > 0:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
68 size = 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
69 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position+size,len(variants)+1,size,"+","Deletion",size,0,current_query_name,"within_alignment")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
70 report = [current_reference_name,current_reference_position,current_reference_position+size,"Assemblytics_w_"+str(len(variants)+1),size,"+","Deletion",size,0,current_query_name,"within_alignment",current_query_position,current_query_position]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
71 current_reference_position += size # update reference position after deletion
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
72 variants.append(report)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
73 elif tick < 0:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
74 size = 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
75 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position,len(variants)+1,size,"+","Insertion",0,size,current_query_name,"within_alignment")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
76 report = [current_reference_name,current_reference_position,current_reference_position,"Assemblytics_w_"+str(len(variants)+1),size,"+","Insertion",0,size,current_query_name,"within_alignment",current_query_position,current_query_position+size]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
77 current_query_position += size # update query position after insertion
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
78 variants.append(report)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
79 # TESTING
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
80 # print line, report
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
81
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
82
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
83 f.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
84
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
85 newcounter = 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
86 for line in variants:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
87 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % line
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
88 if line[4] >= minimum_variant_size:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
89 line[3] = "Assemblytics_w_%d" % (newcounter)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
90 print ("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
91 # print "\t".join(map(str,line))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
92 newcounter += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
93
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
94
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
95 def main():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
96 parser=argparse.ArgumentParser(description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
97 parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
98 parser.add_argument("--min",help="Minimum size (bp) of variant to include, default = 50" ,dest="minimum_variant_size",type=int, default=50)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
99 parser.set_defaults(func=run)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
100 args=parser.parse_args()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
101 args.func(args)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
102
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
103 if __name__=="__main__":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
104 main()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
105