diff GFFalign.py @ 0:294f5ba28746 draft

"planemo upload for repository https://github.com/abims-sbr/tools-abims/tools/gffalign commit d8aa0e49353e78e5fd772498a1fcf591e2744f99"
author abims-sbr
date Mon, 30 Nov 2020 18:20:46 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GFFalign.py	Mon Nov 30 18:20:46 2020 +0000
@@ -0,0 +1,337 @@
+#!/usr/bin/python3 
+
+import argparse
+import os
+import gffutils
+from Bio import SeqIO
+
+
+class GeneComp:
+    # This cass aims to discover the position of the genes in the alignment
+    # and to extract informations on the genes characteristics
+
+    def __init__(self, ccgene, qstart, qend, otgene, dstart, dend, extract, tol, gattr, output=None):
+        # d* / ot* are the db result, q*  / cc the query
+        # gene are the gene from maf-tab file 
+        # start, end are the coordinates  '''
+        self.otbeg = int(otgene.start) - dstart
+        self.otend = int(otgene.end) - dstart
+        self.ccbeg = int(ccgene.start) - qstart
+        self.ccend = int(ccgene.end) - qstart
+        self.tol = tol # how much tolerance in nucleotides
+        self.ccname = ccgene.chrom
+        self.otname = otgene.chrom
+        self.q_outstart = qstart+self.otbeg
+        self.q_outend = qstart+self.otend
+        self.genelist = []
+        self.qstart = qstart
+        if extract:
+            self.fastafile = extract[0]
+            self.fastaout = extract[1]
+        self.output = output
+        self.out = ""
+        self.gattr = f"New_annotation='{gattr}'"
+
+    def is_equal(self):
+        if (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol):
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=confirmed"
+            return self.out
+
+    def is_shorter(self):
+        if (self.otbeg + self.tol) < self.ccbeg and (self.otend - self.tol) > self.ccend:
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter"
+            return self.out
+        elif (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend - self.tol) > self.ccend:
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter_right"
+            return self.out
+        elif (self.otbeg + self.tol) < self.ccbeg and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol):
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=shorter_left"
+            return self.out
+
+    def is_longer(self):
+        if (self.otbeg - self.tol) > self.ccbeg and (self.otend + self.tol) < self.ccend:
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer"
+            return self.out
+        elif (self.otbeg - self.tol) <= self.ccbeg <= (self.otbeg + self.tol) and (self.otend + self.tol) < self.ccend:
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer_right"
+            return self.out
+        elif (self.otbeg - self.tol) > self.ccbeg and (self.otend - self.tol) <= self.ccend <= (self.otend + self.tol):
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=longer_left"
+            return self.out
+
+    def is_offset(self):
+        if (self.otbeg + self.tol) < self.ccbeg < (self.otend - self.tol) and (self.otend + self.tol) < self.ccend:
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=offset_right"
+            return self.out
+        if (self.otbeg - self.tol) > self.ccbeg and (self.otbeg - self.tol) < self.ccend < self.otend + self.tol:
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=offset_left"
+            return self.out
+
+    def is_different(self):
+        if self.otbeg - self.tol > self.ccend or self.otend + self.tol < self.otbeg:
+            self.out = f"{self.ccname}\tprediction\tgene\t{self.q_outstart}\t{self.q_outend}\t.\t{self.qstart}\t.\tID={self.ccname};{self.gattr};Note=new"
+            return self.out
+
+    def extract_fasta(self):
+        with open(self.fastaout,"a") as fileout:
+            with open(self.fastafile) as filefasta:
+                for record in SeqIO.parse(filefasta,"fasta"):
+                    if record.id == self.ccname:
+                        fileout.write(f">{self.ccname}_{self.q_outstart}:{self.q_outend}\n{record.seq[self.q_outstart:self.q_outend]}\n")
+
+    # def extract_stout(self):
+    #     with open(self.fastafile) as filefasta:
+    #             for record in SeqIO.parse(filefasta,"fasta"):
+    #                 if record.id == self.ccname:
+    #                     print(f">{self.ccname}_{self.q_outstart}:{self.q_outend}\n{record.seq[self.q_outstart:self.q_outend]}")
+
+
+    #def __str__(self):
+    #    return f"{self.ccname}\t{self.q_outstart}\t{self.q_outend}"
+
+
+    def fout(self):
+        try:
+            if self.__class__.fout.called:
+                with open(self.output,"a") as fileout:
+                    fileout.write(self.out+"\n")
+        except AttributeError:
+            try:
+                if os.path.isfile(self.output):
+                    os.remove(self.output)
+                with open(self.output,"a") as fileout:
+                        #fileout.write("##gff-version 3\n")
+                    fileout.write(f"##gff-version 3\n{self.out}\n")
+                self.__class__.fout.called = True
+                self.__class__.fout(self)
+            except TypeError:
+                pass
+
+
+def diff_gene(query_genes, target_genes, dstart, dend, qstart, qend, query_db):
+    # are the  two genes the same?
+    if args.extract:
+        extract = args.extract
+    else:
+        extract = None
+
+    if args.tollerance:
+        tol = args.tollerance
+    else:
+        tol = 30
+
+
+    for otgene in target_genes:
+        # gattr is a variable created to store the  the annotation of the target gene. 
+        # It will be use to suggest a functional annotation
+        gattr = str(otgene).split("\t")[-1]
+        for ccgene in query_genes:
+            algene = GeneComp(ccgene, qstart, qend, otgene, dstart, dend, extract, tol, gattr)
+            if "new" in args.verbosity or "all" in args.verbosity:
+                if algene.is_different():
+                    algene_out = algene.is_different()
+                    algene_name = algene_out.split("\t")[0]
+                    algene_start = int(algene_out.split("\t")[3])
+                    algene_end = int(algene_out.split("\t")[4])
+                    #print(al_name, al_start, al_end)
+                    #print(list(target_db.region(region=(al_name, al_start, al_end), completely_within=True)))
+                    if not list(query_db.region(region=(algene_name, algene_start, algene_end), completely_within=False)):
+                        if args.output:
+                            algene.output=args.output
+                            algene.fout()
+                        else:
+                            print(algene.is_different())
+                            if args.extract:
+                                algene.extract_fasta()
+
+            if "shorter" in args.verbosity or "all" in args.verbosity:
+                if algene.is_shorter():
+                    if args.output:
+                        algene.output=args.output
+                        algene.fout()
+                    else:
+                        print(algene.is_shorter())
+                        if args.extract:
+                            algene.extract_fasta()
+            if "longer" in args.verbosity or "all" in args.verbosity:
+                if algene.is_longer():
+                    if args.output:
+                        algene.output=args.output
+                        algene.fout()
+                    else:
+                        print(algene.is_longer())
+                        if args.extract:
+                            algene.extract_fasta()
+            if "offset" in args.verbosity or "all" in args.verbosity:
+                if algene.is_offset():
+                    if args.output:
+                        algene.output=args.output
+                        algene.fout()
+                    else:
+                        print(algene.is_offset())
+                        if args.extract:
+                            algene.extract_fasta()
+            if "confirmed" in args.verbosity:
+                if algene.is_equal():
+                    if args.output:
+                        algene.output=args.output
+                        algene.fout()
+                    else:
+                        print(algene.is_equal())
+                        if args.extract:
+                            algene.extract_fasta()
+
+
+def gff_gene_check(GffName, db_name, memory=0):
+    # The IDs on the GFFs must be unique. Moreover, because only the gene
+    # information are needed, all the other information must be removed
+    # from the GFFs.
+    tempgff=""
+    for line in open(GffName):
+        if line[0] != "#":
+            if line.split()[2] == "gene":
+                tempgff+=line
+
+        else:
+            tempgff+=line
+    
+    if memory:
+        # Write the db in memory and return it as variable so it can be used
+        # as subclass of _DBCreator
+        dbout = gffutils.create_db(tempgff, ":memory:", from_string=True)
+        return dbout
+    else:
+        gffutils.create_db(tempgff, db_name, from_string=True)
+
+def check_strand(start,leng,strand):
+    if strand == "+":
+        end = start+leng
+    else:
+        end = start-leng
+        start,end = end,start
+    return(start,end)
+
+def check_position(line, query_db,target_db):
+    # check if there is a gene in the aligned area
+    # lets' start with the coordinatescharacteristics
+    elsp = line.split('\t')
+    dname = elsp[1]
+    dstart = int(elsp[2])
+    dlen = int(elsp[3])
+    dstrand = elsp[4]
+    qname = elsp[6]
+    qstart = int(elsp[7])
+    qlen = int(elsp[8])
+    qstrand = elsp[9]
+
+    # check the strand, if - reverse the start and the end
+    qstart,qend = check_strand(qstart,qlen,qstrand)
+    dstart,dend = check_strand(dstart,dlen,dstrand)
+
+    #counter
+    d_counter = 0
+    
+    # lists of gene within the coordinates
+    target_genes = [ gene for gene in list(target_db.region(region=(dname, dstart, dend), completely_within=True))]
+    query_genes = [ gene for gene in list(query_db.region(region=(qname, qstart, qend), completely_within=True))]
+
+    if len(target_genes):
+    # and len(query_genes):
+######
+# PER PROVARE HO TOLTO QUESTO, DA CONTROLLARE 
+######
+        #if len(target_genes) >= len(query_genes): # the number of genes in the aligned area must be bigger in the target (?)
+        diff_gene(query_genes, target_genes, dstart, dend, qstart, qend, query_db)
+
+
+    if d_counter>=1:
+        return(d_counter)
+    else:
+        return(0)
+
+
+def main(args):
+    fcounter = 0
+    # import the GFF library and create (has to be done only once)
+    # the GFF databases
+    target_db = query_db = None
+    db_query_name=args.queryGff + "_db"
+    db_target_name=args.targetGff + "_db"
+
+    if args.force_database:
+        # remove the database if required by user
+        os.remove(db_query_name)
+        os.remove(db_target_name)
+
+    if args.use_query_database:
+        # use the db passed by the user
+        query_db = gffutils.FeatureDB(args.use_query_database)
+    else:
+        #create the db
+        #gffutils.create_db(args.queryGff, db_query_name)
+        qdbout = gff_gene_check(args.queryGff, db_query_name, args.memory)
+
+        if args.memory:
+            query_db = gffutils.FeatureDB(qdbout.dbfn)
+        else:
+            query_db = gffutils.FeatureDB(db_query_name)
+        # except ValueError:
+        #     print("Please check your GFF file")
+        # except:
+        #     raise Exception(f"It seems you already have a db called {db_query_name}. Use the -qu if you want to use it or delete the")
+
+
+    if args.use_target_database:
+        target_db = gffutils.FeatureDB(args.use_target_database)
+    else:
+        tdbout = gff_gene_check(args.targetGff, db_target_name, args.memory)
+        if args.memory:
+            target_db = gffutils.FeatureDB(tdbout.dbfn)
+        else:
+            target_db = gffutils.FeatureDB(db_target_name)
+        # except ValueError:
+        #     print("Please check your GFF file")if "all" in args.verbosity:
+        # except:
+        #     raise Exception(f"It seems you already have a db called {db_target_name}. Use the -du if you want to use it or delete the")
+
+    # put the content of the DB in objects
+    with open(args.aln) as maftab:
+        for line in maftab:
+            if line[0] != "#":
+                fcounter += check_position(line, query_db, target_db)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description = '''Tool to extract genes coordinates from a whole genome alignent.
+                        This script needs an alignement in TAB format and two gff files''')
+    parser.add_argument('aln', help='alignment file in TAB format. The suggested way to obtain it is to run Last and\
+                                     than convert the file from MAF to TAB with maf-convert')
+    parser.add_argument('queryGff',
+                        help='''Gff file of the query organism. The gene IDs in the GFF must be unique.''')
+    parser.add_argument('targetGff',
+                        help='''Gff file of the "target" organism. The gene IDs in the GFF must be unique.''')
+    parser.add_argument("-uq", "--use-query-database",
+                        help='''Use this parament if you already have a query gffutils formatted database or
+                                if it\'s not the first time you run this script''', type=str)
+    parser.add_argument("-ut", "--use-target-database",
+                        help='''Use this parament if you already have a target gffutils formatted database or
+                                if it\'s not the first time you run this script''', type=str)
+    parser.add_argument("-fd", "--force-database", help="delete old gffutils databases and create new ones", action='store_true')
+    parser.add_argument("-m", "--memory", help='''create an in-memory database. This option can't be used with the other DB options.
+                                probably usefull in Galaxy integration''', action='store_true' )
+    parser.add_argument("-e", "--extract",
+                        help='''Extract the fasta sequence of the new suggested gene. It takes two argument: the fasta file
+                        of the genome and the name of the output file. This will slow down the process A LOT.''', nargs=2, type=str)
+    #parser.add_argument("-es", "--extract_stout", help='Like -e but it will print the result in the standard output. FASTER than -e. It need the fasta file', type=str)
+    parser.add_argument("-o", "--output", help='Name of the output file. Default output is stout', type=str)
+    parser.add_argument("-t", "--tollerance", help='Interval, in nucleotide, within a gene is considered in the same position. Default 30', default=30, type=int)
+    parser.add_argument("-v", "--verbosity",
+                        help='''Output options. If not specify the software shows only the genes that are in the  exact position of the
+                        genes in the target. It\'s possible to show annotated genes that are in aligned regions but that have different lengths or in slightly 
+                        different positions. It's possible to select multiple, space separated, values.''',
+                        choices=["all","shorter", "longer", "offset", "new", "confirmed"], nargs='*', default='new', type=str)
+                        #metavar=('all','shorter','longer','shifted','new'), 
+                        #action='append', 
+                        #choices=["all","shorter", "longer", "shifted", "new"], default="new")
+    parser.add_argument('--version', action='version', version='0.1.0')
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file