annotate scripts/ReMatCh/utils/gffParser.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
1 #!/usr/bin/env python3
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
2
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
3 import argparse
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
4 import os
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
5 from Bio import SeqIO
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
6 from Bio.Seq import Seq
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
7 from Bio.SeqRecord import SeqRecord
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
8 import ntpath
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
9
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
10 version = '1.0'
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
11
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
12
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def parse_id(filename):
    """Read the wanted feature IDs from a text file, one ID per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so that a trailing newline or an empty separator line does not
    produce an empty-string ID.

    :param filename: path to the text file listing the IDs of interest.
    :return: list of non-empty ID strings, in file order.
    """
    with open(filename, 'r') as in_handle:
        stripped_lines = (line.strip() for line in in_handle)
        return [feature_id for feature_id in stripped_lines if feature_id]
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
21
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
22
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def retrieve_seq_file(fasta_file, coord_file, extra_seq, filename, output_dir):
    """Extract sequence regions listed in a CSV coordinate file.

    The FASTA file is loaded into a dictionary keyed by record ID. Each
    non-blank line of ``coord_file`` is split on commas: the first field is
    the contig ID and the last two fields are the integer start and end.
    Both coordinates are widened by ``extra_seq`` and then used directly as
    Python slice bounds on the contig sequence.

    Regions that cannot be retrieved (unknown contig, widened start below 0,
    or widened end beyond the contig length) are appended to
    ``<output_dir>/<filename>_fails.txt`` as ``contig,start,end``. All other
    regions are written to ``<output_dir>/<filename>.fasta`` with the record
    ID ``<contig>#<start>_<end>`` (widened coordinates).

    :param fasta_file: path to the FASTA file holding the contig sequences.
    :param coord_file: path to the CSV file with the regions to retrieve.
    :param extra_seq: number of extra bases to add on each side of a region.
    :param filename: base name used for the output and fail-log files.
    :param output_dir: directory where the output files are written.
    :raises ValueError: if a coordinate field is not an integer.
    :raises IndexError: if a non-blank line has fewer than two fields.
    """
    # Plain 'r' already gives universal newlines on Python 3; the legacy
    # 'rU' mode was removed in Python 3.11 and raises ValueError there.
    with open(fasta_file, 'r') as handle:
        records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

    # contig ID -> list of (start, end) tuples requested for that contig
    seq_2_get = {}
    with open(coord_file, 'r') as sequences2get:
        for line in sequences2get:
            line = line.strip()
            if not line:
                # tolerate blank / trailing empty lines
                continue
            fields = line.split(',')
            coords = (int(fields[-2]), int(fields[-1]))
            seq_2_get.setdefault(fields[0], []).append(coords)

    records = []
    failed_lines = []
    for contig, list_coords in seq_2_get.items():
        # A contig missing from the FASTA is reported as a failure instead
        # of aborting the whole run with a KeyError.
        contig_record = records_dict.get(contig)
        for coord in list_coords:
            coord1 = coord[0] - extra_seq
            coord2 = coord[1] + extra_seq
            if contig_record is None or coord1 < 0 or coord2 > len(contig_record.seq):
                failed_lines.append(contig + ',' + str(coord[0]) + ',' + str(coord[1]) + '\n')
                continue
            geneseq = str(contig_record.seq[coord1:coord2])
            records.append(SeqRecord(Seq(geneseq),
                                     id=str(contig) + '#' + str(coord1) + '_' + str(coord2),
                                     description=''))

    with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle:
        SeqIO.write(records, output_handle, "fasta")

    if failed_lines:
        # Append mode is kept so an existing fail log is never truncated here.
        with open(output_dir + '/' + filename + '_fails.txt', 'a') as fail_log:
            fail_log.writelines(failed_lines)

    successes = len(records)
    fails = len(failed_lines)
    print('Retrived %s features successfully from %s with %s bp as extra'
          ' sequence.' % (str(successes), filename, str(extra_seq)))
    if fails > 0:
        print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename))
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
66
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
67
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def retrieve_seq(fasta_file, gff_features, extra_seq, filename, output_dir):
    """Extract the sequence of every feature in ``gff_features``.

    The FASTA file is loaded into a dictionary keyed by record ID (one
    contig per entry). For each feature the start and end are widened by
    ``extra_seq`` and used as Python slice bounds on the contig sequence;
    features on the ``'-'`` strand are reverse-complemented.

    Features that cannot be retrieved (unknown contig, widened start below
    0, or widened end beyond the contig length) have their locus ID appended
    to ``<output_dir>/<filename>_fails.txt``. All other features are written
    to ``<output_dir>/<filename>.fasta`` with the record ID
    ``<locus>-<contig>#<start>_<end>`` (un-widened coordinates).

    :param fasta_file: path to the FASTA file holding the contig sequences.
    :param gff_features: dict mapping locus ID to
        ``[contig, start, end, strand]`` (the shape built by
        ``parse_features``).
    :param extra_seq: number of extra bases to add on each side of a feature.
    :param filename: base name used for the output and fail-log files.
    :param output_dir: directory where the output files are written.
    """
    # Plain 'r' already gives universal newlines on Python 3; the legacy
    # 'rU' mode was removed in Python 3.11 and raises ValueError there.
    with open(fasta_file, 'r') as handle:
        records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

    records = []
    failed_loci = []
    for locus, location in gff_features.items():
        contig, start, end, strand = location[0], location[1], location[2], location[3]
        # A contig missing from the FASTA is reported as a failure instead
        # of aborting the whole run with a KeyError.
        contig_record = records_dict.get(contig)
        coord1 = start - extra_seq
        coord2 = end + extra_seq
        if contig_record is None or coord1 < 0 or coord2 > len(contig_record.seq):
            failed_loci.append(locus + '\n')
            continue

        geneseq = str(contig_record.seq[coord1:coord2])
        if strand == '-':
            geneseq = str(Seq(geneseq).reverse_complement())
        records.append(SeqRecord(Seq(geneseq),
                                 id=str(locus + '-' + str(contig) + '#' + str(start) + '_' + str(end)),
                                 description=''))

    with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle:
        SeqIO.write(records, output_handle, "fasta")

    if failed_loci:
        # Append mode is kept so an existing fail log is never truncated here.
        with open(output_dir + '/' + filename + '_fails.txt', 'a') as fail_log:
            fail_log.writelines(failed_loci)

    successes = len(records)
    fails = len(failed_loci)
    print('Retrived %s features successfully from %s with %s bp as extra'
          ' sequence.' % (str(successes), filename, str(extra_seq)))
    if fails > 0:
        print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename))
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
104
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
105
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def parse_features(temp_gff):
    """Parse the CDS lines of a tab-separated feature file into a dictionary.

    Only lines whose third column contains ``CDS`` are kept. The locus ID is
    the value of the first ``key=value`` pair in the last column (pairs are
    separated by ``;``). The start (fourth column) is decremented by one so
    that ``sequence[start:end]`` covers the full feature.

    Lines with fewer than seven columns, or whose first attribute has no
    ``=``, are skipped instead of raising ``IndexError``.

    :param temp_gff: path to the feature-only GFF file.
    :return: dict mapping locus ID to ``[contig, start, end, strand]`` where
        ``start`` and ``end`` are integers. A later line with the same locus
        ID overwrites an earlier one.
    :raises ValueError: if the start or end column of a CDS line is not an
        integer.
    """
    gff_features = {}

    with open(temp_gff, 'r') as temp_genes:
        for line in temp_genes:
            # Drop the line terminator first: when the attribute column holds
            # a single ``ID=...`` pair the newline would otherwise end up
            # inside the locus ID.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 7 or "CDS" not in fields[2]:
                continue

            first_attribute = fields[-1].split(';')[0].split('=')
            if len(first_attribute) < 2:
                # no ``key=value`` pair to take the locus ID from
                continue
            locus_id = str(first_attribute[1])

            contig = fields[0]
            beginning = int(fields[3]) - 1  # to get the full sequence
            end = int(fields[4])
            strand = fields[6]
            gff_features[locus_id] = [contig, beginning, end, strand]

    return gff_features
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
123
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
124
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def gff_parser(gff_file, extra_seq=0, output_dir='.', keep_temporary_files=False, ids=None, coord_file=None):
    """Split a combined GFF file and retrieve the requested sequences.

    The input is split line by line: lines starting with ``##`` are dropped,
    lines containing a tab are treated as feature lines and every remaining
    line is treated as contig sequence and written to the temporary
    ``<name>_sequence.fasta``.

    * Without ``coord_file``: feature lines are written to the temporary
      ``<name>_features.gff`` (restricted to the IDs listed in ``ids`` when
      given), parsed with ``parse_features`` and extracted with
      ``retrieve_seq``.
    * With ``coord_file``: feature lines are ignored and the regions listed
      in the coordinate file are extracted with ``retrieve_seq_file``.

    ``<name>`` is the base name of ``gff_file`` with every ``.gff``
    occurrence removed.

    :param gff_file: path to the GFF file holding features and sequences.
    :param extra_seq: extra bases to retrieve on each side of a region.
    :param output_dir: directory for the output and temporary files.
    :param keep_temporary_files: keep the temporary feature/sequence files.
    :param ids: optional path to a text file with feature IDs, one per line.
    :param coord_file: optional path to a CSV file of ``contig,start,end``.
    """
    filename = ntpath.basename(gff_file).replace('.gff', '')
    features_path = os.path.join(output_dir, filename + '_features.gff')
    sequence_path = os.path.join(output_dir, filename + '_sequence.fasta')
    fails_path = os.path.join(output_dir, filename + '_fails.txt')

    # Clean leftovers from a previous run. This matters most for the fail
    # log, which the retrieve functions open in append mode.
    for stale_path in (features_path, sequence_path, fails_path):
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    if coord_file is None:
        # A set gives O(1) membership tests for every feature line.
        select_ids = set(parse_id(ids)) if ids is not None else None

        # Separate the GFF into two files: one with the features and
        # another with the contig sequences.
        with open(gff_file, 'r') as in_handle, open(features_path, 'w') as temp_genes, \
                open(sequence_path, 'w') as temp_contigs:
            for line in in_handle:
                if line.startswith('##'):
                    continue
                if '\t' not in line:
                    temp_contigs.write(line)
                    continue
                if select_ids is None:
                    temp_genes.write(line)
                    continue
                # Strip the line terminator before reading the attribute
                # column: with a single ``ID=...`` attribute the newline
                # would become part of the ID and never match a selected ID.
                first_attribute = line.rstrip('\r\n').split('\t')[-1].split(';')[0].split('=')
                if len(first_attribute) > 1 and first_attribute[1] in select_ids:
                    temp_genes.write(line)

        gff_features = parse_features(features_path)

        retrieve_seq(sequence_path, gff_features, extra_seq, filename, output_dir)

    else:
        # Only the contig sequences are needed; feature lines are ignored.
        with open(gff_file, 'r') as in_handle, open(sequence_path, 'w') as temp_contigs:
            for line in in_handle:
                if not line.startswith('##') and '\t' not in line:
                    temp_contigs.write(line)

        retrieve_seq_file(sequence_path, coord_file, extra_seq, filename, output_dir)

    # removing temp files
    if not keep_temporary_files:
        try:
            # The feature file does not exist in coordinate-file mode, so its
            # removal is best effort. Only OS errors are ignored.
            os.remove(features_path)
        except OSError:
            pass
        os.remove(sequence_path)
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
185
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
186
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
def main():
    """Command-line entry point: parse the arguments and run ``gff_parser``.

    The output directory is created when missing. ``--select`` and
    ``--fromFile`` are mutually exclusive.
    """
    parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.',
                                     epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)')
    parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))

    parser.add_argument('-i', '--input',
                        help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from'
                             ' PROKKA).', type=argparse.FileType('r'), required=True)
    parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int,
                        required=False)
    parser.add_argument('-k', '--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.',
                        action='store_true')
    parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.',
                        required=False)

    parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group()
    parser_optional_selected_regions_exclusive.add_argument('-s', '--select',
                                                            help='txt file with the IDs of interest, one per line',
                                                            type=argparse.FileType('r'), required=False)
    parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile',
                                                            help='Sequence coordinates to be retrieved. Requires contig'
                                                                 ' ID and coords (contig,strart,end) in a csv file. One'
                                                                 ' per line.', type=argparse.FileType('r'),
                                                            required=False)

    args = parser.parse_args()

    # argparse.FileType is used only to check that the files can be opened;
    # gff_parser reopens them by path. Record the paths and close the handles
    # so they are not left open for the whole run.
    input_path = os.path.abspath(args.input.name)
    select_path = os.path.abspath(args.select.name) if args.select is not None else None
    coord_path = os.path.abspath(args.fromFile.name) if args.fromFile is not None else None
    for handle in (args.input, args.select, args.fromFile):
        if handle is not None:
            handle.close()

    args.outputDir = os.path.abspath(args.outputDir)
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(args.outputDir, exist_ok=True)

    gff_parser(input_path, args.extraSeq, args.outputDir,
               args.keepTemporaryFiles,
               select_path,
               coord_path)


if __name__ == "__main__":
    main()