Mercurial > repos > abims-sbr > gffalign
comparison GFFalign.py @ 0:294f5ba28746 draft
"planemo upload for repository https://github.com/abims-sbr/tools-abims/tools/gffalign commit d8aa0e49353e78e5fd772498a1fcf591e2744f99"
author | abims-sbr |
---|---|
date | Mon, 30 Nov 2020 18:20:46 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:294f5ba28746 |
---|---|
1 #!/usr/bin/python3 | |
2 | |
3 import argparse | |
4 import os | |
5 import gffutils | |
6 from Bio import SeqIO | |
7 | |
8 | |
9 class GeneComp: | |
10 # This cass aims to discover the position of the genes in the alignment | |
11 # and to extract informations on the genes characteristics | |
12 | |
13 def __init__(self, ccgene, qstart, qend, otgene, dstart, dend, extract, tol, gattr, output=None): | |
14 # d* / ot* are the db result, q* / cc the query | |
15 # gene are the gene from maf-tab file | |
16 # start, end are the coordinates ''' | |
17 self.otbeg = int(otgene.start) - dstart | |
18 self.otend = int(otgene.end) - dstart | |
19 self.ccbeg = int(ccgene.start) - qstart | |
20 self.ccend = int(ccgene.end) - qstart | |
21 self.tol = tol # how much tolerance in nucleotides | |
22 self.ccname = ccgene.chrom | |
23 self.otname = otgene.chrom | |
24 self.q_outstart = qstart+self.otbeg | |
25 self.q_outend = qstart+self.otend | |
26 self.genelist = [] | |
27 self.qstart = qstart | |
28 if extract: | |
29 self.fastafile = extract[0] | |
30 self.fastaout = extract[1] | |
31 self.output = output | |
32 self.out = "" | |
33 self.gattr = f"New_annotation='{gattr}'" | |
34 | |
35 def is_equal(self): | |
36 if (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol): | |
37 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=confirmed" | |
38 return self.out | |
39 | |
40 def is_shorter(self): | |
41 if (self.otbeg + self.tol) < self.ccbeg and (self.otend - self.tol) > self.ccend: | |
42 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter" | |
43 return self.out | |
44 elif (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend - self.tol) > self.ccend: | |
45 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter_right" | |
46 return self.out | |
47 elif (self.otbeg + self.tol) < self.ccbeg and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol): | |
48 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter_left" | |
49 return self.out | |
50 | |
51 def is_longer(self): | |
52 if (self.otbeg - self.tol) > self.ccbeg and (self.otend + self.tol) < self.ccend: | |
53 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer" | |
54 return self.out | |
55 elif (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend + self.tol) < self.ccend: | |
56 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer_right" | |
57 return self.out | |
58 elif (self.otbeg - self.tol) > self.ccbeg and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol): | |
59 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer_left" | |
60 return self.out | |
61 | |
62 def is_offset(self): | |
63 if (self.otbeg + self.tol) < self.ccbeg < (self.otend - self.tol) and (self.otend + self.tol) < self.ccend: | |
64 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=offset_right" | |
65 return self.out | |
66 if (self.otbeg - self.tol) > self.ccbeg and (self.otbeg - self.tol) < self.ccend < self.otend + self.tol: | |
67 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=offset_left" | |
68 return self.out | |
69 | |
70 def is_different(self): | |
71 if self.otbeg - self.tol > self.ccend or self.otend + self.tol < self.otbeg: | |
72 self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=new" | |
73 return self.out | |
74 | |
75 def extract_fasta(self): | |
76 with open(self.fastaout,"a") as fileout: | |
77 with open(self.fastafile) as filefasta: | |
78 for record in SeqIO.parse(filefasta,"fasta"): | |
79 if record.id == self.ccname: | |
80 fileout.write(f">{self.ccname}_{self.q_outstart}:{self.q_outend}\n{record.seq[self.q_outstart:self.q_outend]}\n") | |
81 | |
82 # def extract_stout(self): | |
83 # with open(self.fastafile) as filefasta: | |
84 # for record in SeqIO.parse(filefasta,"fasta"): | |
85 # if record.id == self.ccname: | |
86 # print(f">{self.ccname}_{self.q_outstart}:{self.q_outend}\n{record.seq[self.q_outstart:self.q_outend]}") | |
87 | |
88 | |
89 #def __str__(self): | |
90 # return f"{self.ccname}\t{self.q_outstart}\t{self.q_outend}" | |
91 | |
92 | |
93 def fout(self): | |
94 try: | |
95 if self.__class__.fout.called: | |
96 with open(self.output,"a") as fileout: | |
97 fileout.write(self.out+"\n") | |
98 except AttributeError: | |
99 try: | |
100 if os.path.isfile(self.output): | |
101 os.remove(self.output) | |
102 with open(self.output,"a") as fileout: | |
103 #fileout.write("##gff-version 3\n") | |
104 fileout.write(f"##gff-version 3\n{self.out}\n") | |
105 self.__class__.fout.called = True | |
106 self.__class__.fout(self) | |
107 except TypeError: | |
108 pass | |
109 | |
110 | |
111 def diff_gene(query_genes, target_genes, dstart, dend, qstart, qend, query_db): | |
112 # are the two genes the same? | |
113 if args.extract: | |
114 extract = args.extract | |
115 else: | |
116 extract = None | |
117 | |
118 if args.tollerance: | |
119 tol = args.tollerance | |
120 else: | |
121 tol = 30 | |
122 | |
123 | |
124 for otgene in target_genes: | |
125 # gattr is a variable created to store the the annotation of the target gene. | |
126 # It will be use to suggest a functional annotation | |
127 gattr = str(otgene).split("\t")[-1] | |
128 for ccgene in query_genes: | |
129 algene = GeneComp(ccgene, qstart, qend, otgene, dstart, dend, extract, tol, gattr) | |
130 if "new" in args.verbosity or "all" in args.verbosity: | |
131 if algene.is_different(): | |
132 algene_out = algene.is_different() | |
133 algene_name = algene_out.split("\t")[0] | |
134 algene_start = int(algene_out.split("\t")[3]) | |
135 algene_end = int(algene_out.split("\t")[4]) | |
136 #print(al_name, al_start, al_end) | |
137 #print(list(target_db.region(region=(al_name, al_start, al_end), completely_within=True))) | |
138 if not list(query_db.region(region=(algene_name, algene_start, algene_end), completely_within=False)): | |
139 if args.output: | |
140 algene.output=args.output | |
141 algene.fout() | |
142 else: | |
143 print(algene.is_different()) | |
144 if args.extract: | |
145 algene.extract_fasta() | |
146 | |
147 if "shorter" in args.verbosity or "all" in args.verbosity: | |
148 if algene.is_shorter(): | |
149 if args.output: | |
150 algene.output=args.output | |
151 algene.fout() | |
152 else: | |
153 print(algene.is_shorter()) | |
154 if args.extract: | |
155 algene.extract_fasta() | |
156 if "longer" in args.verbosity or "all" in args.verbosity: | |
157 if algene.is_longer(): | |
158 if args.output: | |
159 algene.output=args.output | |
160 algene.fout() | |
161 else: | |
162 print(algene.is_longer()) | |
163 if args.extract: | |
164 algene.extract_fasta() | |
165 if "offset" in args.verbosity or "all" in args.verbosity: | |
166 if algene.is_offset(): | |
167 if args.output: | |
168 algene.output=args.output | |
169 algene.fout() | |
170 else: | |
171 print(algene.is_offset()) | |
172 if args.extract: | |
173 algene.extract_fasta() | |
174 if "confirmed" in args.verbosity: | |
175 if algene.is_equal(): | |
176 if args.output: | |
177 algene.output=args.output | |
178 algene.fout() | |
179 else: | |
180 print(algene.is_equal()) | |
181 if args.extract: | |
182 algene.extract_fasta() | |
183 | |
184 | |
185 def gff_gene_check(GffName, db_name, memory=0): | |
186 # The IDs on the GFFs must be unique. Moreover, because only the gene | |
187 # information are needed, all the other information must be removed | |
188 # from the GFFs. | |
189 tempgff="" | |
190 for line in open(GffName): | |
191 if line[0] != "#": | |
192 if line.split()[2] == "gene": | |
193 tempgff+=line | |
194 | |
195 else: | |
196 tempgff+=line | |
197 | |
198 if memory: | |
199 # Write the db in memory and return it as variable so it can be used | |
200 # as subclass of _DBCreator | |
201 dbout = gffutils.create_db(tempgff, ":memory:", from_string=True) | |
202 return dbout | |
203 else: | |
204 gffutils.create_db(tempgff, db_name, from_string=True) | |
205 | |
206 def check_strand(start,leng,strand): | |
207 if strand == "+": | |
208 end = start+leng | |
209 else: | |
210 end = start-leng | |
211 start,end = end,start | |
212 return(start,end) | |
213 | |
214 def check_position(line, query_db,target_db): | |
215 # check if there is a gene in the aligned area | |
216 # lets' start with the coordinatescharacteristics | |
217 elsp = line.split('\t') | |
218 dname = elsp[1] | |
219 dstart = int(elsp[2]) | |
220 dlen = int(elsp[3]) | |
221 dstrand = elsp[4] | |
222 qname = elsp[6] | |
223 qstart = int(elsp[7]) | |
224 qlen = int(elsp[8]) | |
225 qstrand = elsp[9] | |
226 | |
227 # check the strand, if - reverse the start and the end | |
228 qstart,qend = check_strand(qstart,qlen,qstrand) | |
229 dstart,dend = check_strand(dstart,dlen,dstrand) | |
230 | |
231 #counter | |
232 d_counter = 0 | |
233 | |
234 # lists of gene within the coordinates | |
235 target_genes = [ gene for gene in list(target_db.region(region=(dname, dstart, dend), completely_within=True))] | |
236 query_genes = [ gene for gene in list(query_db.region(region=(qname, qstart, qend), completely_within=True))] | |
237 | |
238 if len(target_genes): | |
239 # and len(query_genes): | |
240 ###### | |
241 # PER PROVARE HO TOLTO QUESTO, DA CONTROLLARE | |
242 ###### | |
243 #if len(target_genes) >= len(query_genes): # the number of genes in the aligned area must be bigger in the target (?) | |
244 diff_gene(query_genes, target_genes, dstart, dend, qstart, qend, query_db) | |
245 | |
246 | |
247 if d_counter>=1: | |
248 return(d_counter) | |
249 else: | |
250 return(0) | |
251 | |
252 | |
253 def main(args): | |
254 fcounter = 0 | |
255 # import the GFF library and create (has to be done only once) | |
256 # the GFF databases | |
257 target_db = query_db = None | |
258 db_query_name=args.queryGff + "_db" | |
259 db_target_name=args.targetGff + "_db" | |
260 | |
261 if args.force_database: | |
262 # remove the database if required by user | |
263 os.remove(db_query_name) | |
264 os.remove(db_target_name) | |
265 | |
266 if args.use_query_database: | |
267 # use the db passed by the user | |
268 query_db = gffutils.FeatureDB(args.use_query_database) | |
269 else: | |
270 #create the db | |
271 #gffutils.create_db(args.queryGff, db_query_name) | |
272 qdbout = gff_gene_check(args.queryGff, db_query_name, args.memory) | |
273 | |
274 if args.memory: | |
275 query_db = gffutils.FeatureDB(qdbout.dbfn) | |
276 else: | |
277 query_db = gffutils.FeatureDB(db_query_name) | |
278 # except ValueError: | |
279 # print("Please check your GFF file") | |
280 # except: | |
281 # raise Exception(f"It seems you already have a db called {db_query_name}. Use the -qu if you want to use it or delete the") | |
282 | |
283 | |
284 if args.use_target_database: | |
285 target_db = gffutils.FeatureDB(args.use_target_database) | |
286 else: | |
287 tdbout = gff_gene_check(args.targetGff, db_target_name, args.memory) | |
288 if args.memory: | |
289 target_db = gffutils.FeatureDB(tdbout.dbfn) | |
290 else: | |
291 target_db = gffutils.FeatureDB(db_target_name) | |
292 # except ValueError: | |
293 # print("Please check your GFF file")if "all" in args.verbosity: | |
294 # except: | |
295 # raise Exception(f"It seems you already have a db called {db_target_name}. Use the -du if you want to use it or delete the") | |
296 | |
297 # put the content of the DB in objects | |
298 with open(args.aln) as maftab: | |
299 for line in maftab: | |
300 if line[0] != "#": | |
301 fcounter += check_position(line, query_db, target_db) | |
302 | |
303 if __name__ == '__main__': | |
304 parser = argparse.ArgumentParser(description = '''Tool to extract genes coordinates from a whole genome alignent. | |
305 This script needs an alignement in TAB format and two gff files''') | |
306 parser.add_argument('aln', help='alignment file in TAB format. The suggested way to obtain it is to run Last and\ | |
307 than convert the file from MAF to TAB with maf-convert') | |
308 parser.add_argument('queryGff', | |
309 help='''Gff file of the query organism. The gene IDs in the GFF must be unique.''') | |
310 parser.add_argument('targetGff', | |
311 help='''Gff file of the "target" organism. The gene IDs in the GFF must be unique.''') | |
312 parser.add_argument("-uq", "--use-query-database", | |
313 help='''Use this parament if you already have a query gffutils formatted database or | |
314 if it\'s not the first time you run this script''', type=str) | |
315 parser.add_argument("-ut", "--use-target-database", | |
316 help='''Use this parament if you already have a target gffutils formatted database or | |
317 if it\'s not the first time you run this script''', type=str) | |
318 parser.add_argument("-fd", "--force-database", help="delete old gffutils databases and create new ones", action='store_true') | |
319 parser.add_argument("-m", "--memory", help='''create an in-memory database. This option can't be used with the other DB options. | |
320 probably usefull in Galaxy integration''', action='store_true' ) | |
321 parser.add_argument("-e", "--extract", | |
322 help='''Extract the fasta sequence of the new suggested gene. It takes two argument: the fasta file | |
323 of the genome and the name of the output file. This will slow down the process A LOT.''', nargs=2, type=str) | |
324 #parser.add_argument("-es", "--extract_stout", help='Like -e but it will print the result in the standard output. FASTER than -e. It need the fasta file', type=str) | |
325 parser.add_argument("-o", "--output", help='Name of the output file. Default output is stout', type=str) | |
326 parser.add_argument("-t", "--tollerance", help='Interval, in nucleotide, within a gene is considered in the same position. Default 30', default=30, type=int) | |
327 parser.add_argument("-v", "--verbosity", | |
328 help='''Output options. If not specify the software shows only the genes that are in the exact position of the | |
329 genes in the target. It\'s possible to show annotated genes that are in aligned regions but that have different lengths or in slightly | |
330 different positions. It's possible to select multiple, space separated, values.''', | |
331 choices=["all","shorter", "longer", "offset", "new", "confirmed"], nargs='*', default='new', type=str) | |
332 #metavar=('all','shorter','longer','shifted','new'), | |
333 #action='append', | |
334 #choices=["all","shorter", "longer", "shifted", "new"], default="new") | |
335 parser.add_argument('--version', action='version', version='0.1.0') | |
336 args = parser.parse_args() | |
337 main(args) |