comparison deg_annotate.py @ 0:b42373cddb77 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deg_annotate commit bc766aeec78fe74a1d70d608d2f73ba3f2a3e047
author iuc
date Fri, 23 Nov 2018 01:59:47 -0500
parents
children e98d4ab5b5bc
comparison
equal deleted inserted replaced
-1:000000000000 0:b42373cddb77
1 import argparse
2 import os
3 from collections import defaultdict
4
5 from BCBio import GFF
6
7
8 def strandardize(strand):
9 if str(strand) == '-1':
10 strand = '-'
11 elif str(strand) == '1':
12 strand = '+'
13 return strand
14
15
16 def gff_to_dict(f_gff, feat_type, idattr, txattr, attributes, input_type):
17 """
18 It reads only exonic features because not all GFF files contain gene and trascript features. From the exonic
19 features it extracts gene names, biotypes, start and end positions. If any of these attributes do not exit
20 then they are set to NA.
21 """
22 annotation = defaultdict(lambda: defaultdict(lambda: 'NA'))
23 exon_pos = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
24 tx_info = defaultdict(lambda: defaultdict(str))
25
26 with open(f_gff) as gff_handle:
27 for rec in GFF.parse(gff_handle, limit_info=dict(gff_type=[feat_type]), target_lines=1):
28 for sub_feature in rec.features:
29 start = sub_feature.location.start
30 end = sub_feature.location.end
31 strand = strandardize(sub_feature.location.strand)
32 try:
33 geneid = sub_feature.qualifiers[idattr][0]
34 except KeyError:
35 print("No '" + idattr + "' attribute found for the feature at position "
36 + rec.id + ":" + str(start) + ":" + str(end) + ". Please check your GTF/GFF file.")
37 continue
38
39 annotation[geneid]['chr'] = rec.id
40 annotation[geneid]['strand'] = strand
41 if annotation[geneid]['start'] == 'NA' or start <= int(annotation[geneid]['start']):
42 annotation[geneid]['start'] = start
43 if annotation[geneid]['end'] == 'NA' or end >= int(annotation[geneid]['end']):
44 annotation[geneid]['end'] = end
45
46 for attr in attributes:
47 if attr in annotation[geneid]:
48 continue
49 try:
50 annotation[geneid][attr] = sub_feature.qualifiers[attr][0]
51 except KeyError:
52 annotation[geneid][attr] = 'NA'
53 # extract exon information only in case of dexseq output
54 if input_type != "dexseq":
55 continue
56 try:
57 txid = sub_feature.qualifiers[txattr][0]
58 tx_info[txid]['chr'] = rec.id
59 tx_info[txid]['strand'] = strand
60 exon_pos[txid][int(start)][int(end)] = 1
61 except KeyError:
62 print("No '" + txattr + "' attribute found for the feature at position " + rec.id + ":" + str(
63 start) + ":" + str(end) + ". Please check your GTF/GFF file.")
64 pass
65
66 bed_entries = []
67 # create BED lines only for deseq output
68 if input_type == "dexseq":
69 for txid in exon_pos.keys():
70 starts = sorted(exon_pos[txid])
71 strand = tx_info[txid]['strand']
72 if strand == '-':
73 starts = reversed(starts)
74 for c, start in enumerate(starts, 1):
75 ends = sorted(exon_pos[txid][start])
76 if strand == '-':
77 ends = reversed(ends)
78 for end in ends:
79 bed_entries.append('\t'.join([tx_info[txid]['chr'], str(start), str(end),
80 txid + ':' + str(c), '0', strand]))
81
82 return annotation, bed_entries
83
84
85 def main():
86 parser = argparse.ArgumentParser(description='Annotate DESeq2/DEXSeq tables with information from GFF/GTF files')
87 parser.add_argument('-in', '--input', required=True,
88 help='DESeq2/DEXSeq output. It is allowed to have extra information, '
89 'but make sure that the original output columns are not altered')
90 parser.add_argument('-m', '--mode', required=True, choices=["deseq2", "dexseq"], default='deseq2',
91 help='Input file type')
92 parser.add_argument('-g', '--gff', required=True, help='The same annotation GFF/GTF file used for couting')
93 parser.add_argument('-t', '--type', default='exon', required=False,
94 help='feature type (3rd column in GFF file) to be used (default: exon)')
95 parser.add_argument('-i', '--idattr', default='gene_id', required=False,
96 help='GFF attribute to be used as feature ID. '
97 'This should match the first column of DESeq2 output(default: geneid)')
98 parser.add_argument('-x', '--txattr', default='transcript_id', required=False,
99 help='GFF attribute to be used as transcript ID. Used for DEXSeq output only.'
100 'This should match the first column of DESeq2 output(default: transcript_id)')
101 parser.add_argument('-a', '--attributes', default='gene_biotype, gene_name', required=False,
102 help='Comma separated attributes to include in output. Default: gene_biotype, gene_name')
103 parser.add_argument('-o', '--output', required=True, help='Output file')
104 args = parser.parse_args()
105
106 print("DE(X)Seq output file : %s" % args.input)
107 print("Input file type : %s" % args.mode)
108 print("Annotation file : %s" % args.gff)
109 print("Feature type : %s" % args.type)
110 print("ID attribute : %s" % args.idattr)
111 print("Transcript attribute : %s" % args.txattr)
112 print("Attributes to include : %s" % args.attributes)
113 print("Annotated output file : %s" % args.output)
114
115 attr = [x.strip() for x in args.attributes.split(',')]
116 annotation, bed_entries = gff_to_dict(args.gff, args.type, args.idattr, args.txattr, attr, args.mode)
117
118 d_binexon = {}
119 skip_exon_annotation = False
120
121 if args.mode == "dexseq":
122 with open(args.input) as fh_input, open("input.bed", "w") as fh_input_bed:
123 for line in fh_input:
124 f = line.split('\t')
125 fh_input_bed.write('\t'.join([f[11], f[12], f[13], f[0], "0", f[15]]) + "\n")
126
127 if len(bed_entries) == 0 and args.mode == "dexseq":
128 print("It seems there are no transcript ids present in GFF file. Skipping exon annotation.")
129 skip_exon_annotation = True
130
131 if not skip_exon_annotation:
132 with open("annotation.bed", "w") as fh_annotation_bed:
133 for line in bed_entries:
134 fh_annotation_bed.write(line + "\n")
135
136 # interset the DEXseq couting bins with exons in the GFF file
137 # overlaped positions can be later used to infer which bin corresponds to which exon
138 os.system("intersectBed -wo -s -a input.bed -b annotation.bed > overlap.txt")
139
140 with open("overlap.txt") as fh_overlap:
141 for line in fh_overlap:
142 binid = line.split('\t')[3]
143 exonid = line.split('\t')[9]
144 d_binexon.setdefault(binid, []).append(exonid)
145
146 with open(args.input) as fh_input, open(args.output, 'w') as fh_output:
147 for line in fh_input:
148 annot = []
149 # Append the extra information from GFF to DESeq2 output
150 if args.mode == "deseq2":
151 geneid = line.split('\t')[0]
152 annot = [str(annotation[geneid]['chr']),
153 str(annotation[geneid]['start']),
154 str(annotation[geneid]['end']),
155 str(annotation[geneid]['strand'])]
156 for a in attr:
157 annot.append(annotation[geneid][a])
158 # DEXSeq exonic bins might originate from aggrigating multiple genes. They are are separated by '+'
159 # Append the attributes from the GFF but keep the order of the aggregated genes and use '+'
160 # Aappend the transcript id and exon number from the annotation that correspond to the DEXseq counting bins
161 elif args.mode == "dexseq":
162 geneids = line.split('\t')[1].split('+')
163 for a in attr:
164 tmp = []
165 for geneid in geneids:
166 tmp.append(str(annotation[geneid][a]))
167 annot.append('+'.join(tmp))
168 if not skip_exon_annotation:
169 binid = line.split('\t')[0]
170 try:
171 annot.append(','.join(sorted(set(d_binexon[binid]))))
172 except KeyError:
173 annot.append('NA')
174 fh_output.write(line.rstrip('\n') + '\t' + '\t'.join(annot) + '\n')
175
176
177 if __name__ == "__main__":
178 main()