Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/utils/gffParser.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Tue, 28 Jan 2020 10:42:31 -0500 |
parents | 965517909457 |
children |
comparison
equal
deleted
inserted
replaced
2:6837f733b4aa | 3:0cbed1c0a762 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python3 |
2 | 2 |
3 import argparse, sys, os | 3 import argparse |
4 import os | |
4 from Bio import SeqIO | 5 from Bio import SeqIO |
5 from Bio.Seq import Seq | 6 from Bio.Seq import Seq |
6 from Bio.SeqRecord import SeqRecord | 7 from Bio.SeqRecord import SeqRecord |
7 from Bio.Alphabet import IUPAC | |
8 import ntpath | 8 import ntpath |
9 | 9 |
10 def parseID(filename): | 10 version = '1.0' |
11 #get wanted feature IDs | 11 |
12 gffIDs=[] | 12 |
13 with open(filename, 'r') as in_handle: | 13 def parse_id(filename): |
14 for line in in_handle: | 14 # get wanted feature IDs |
15 line=line.strip() | 15 gff_ids = [] |
16 gffIDs.append(line) | 16 with open(filename, 'r') as in_handle: |
17 return gffIDs | 17 for line in in_handle: |
18 | 18 line = line.strip() |
19 def retrieveSeq_File(fastaFile, coordFile, extraSeq, filename, outputDir): | 19 gff_ids.append(line) |
20 #Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. | 20 return gff_ids |
21 handle = open(fastaFile, "rU") | 21 |
22 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | 22 |
23 handle.close() | 23 def retrieve_seq_file(fasta_file, coord_file, extra_seq, filename, output_dir): |
24 | 24 # Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. |
25 Seq2Get={} | 25 handle = open(fasta_file, "rU") |
26 with open(coordFile, 'r') as sequeces2get: | 26 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) |
27 for line in sequeces2get: | 27 handle.close() |
28 line=line.split(',') | 28 |
29 coords=(int(line[-2]), int(line[-1])) | 29 seq_2_get = {} |
30 contigID=line[0] | 30 with open(coord_file, 'r') as sequeces2get: |
31 if contigID in Seq2Get.keys(): | 31 for line in sequeces2get: |
32 Seq2Get[contigID].append(coords) | 32 line = line.split(',') |
33 else: | 33 coords = (int(line[-2]), int(line[-1])) |
34 Seq2Get[contigID]=[coords] | 34 contig_id = line[0] |
35 | 35 if contig_id in list(seq_2_get.keys()): |
36 with open(outputDir+'/'+filename+'.fasta','w') as output_handle: | 36 seq_2_get[contig_id].append(coords) |
37 fails=0 | 37 else: |
38 successes=0 | 38 seq_2_get[contig_id] = [coords] |
39 records=[] | 39 |
40 for contig, listCoords in Seq2Get.items(): | 40 with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: |
41 contigSeq=records_dict[contig].seq | 41 fails = 0 |
42 for coord in listCoords: | 42 successes = 0 |
43 coord1=coord[0]-extraSeq | 43 records = [] |
44 coord2=coord[1]+extraSeq | 44 for contig, listCoords in list(seq_2_get.items()): |
45 if coord1 < 0 or coord2 > len(contigSeq): | 45 contig_seq = records_dict[contig].seq |
46 fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') | 46 for coord in listCoords: |
47 fail_log.write(contig + ',' + str(coord[0])+','+ str(coord[1])+'\n') | 47 coord1 = coord[0] - extra_seq |
48 fail_log.close() | 48 coord2 = coord[1] + extra_seq |
49 fails+=1 | 49 if coord1 < 0 or coord2 > len(contig_seq): |
50 else: | 50 fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') |
51 geneseq=str(contigSeq[coord1:coord2]) | 51 fail_log.write(contig + ',' + str(coord[0]) + ',' + str(coord[1]) + '\n') |
52 record = SeqRecord(Seq(geneseq), id=str(str(contig)+'#'+str(coord1)+'_'+str(coord2)), description='') | 52 fail_log.close() |
53 records.append(record) | 53 fails += 1 |
54 successes+=1 | 54 else: |
55 SeqIO.write(records, output_handle, "fasta") | 55 geneseq = str(contig_seq[coord1:coord2]) |
56 | 56 record = SeqRecord(Seq(geneseq), id=str(str(contig) + '#' + str(coord1) + '_' + str(coord2)), |
57 print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) | 57 description='') |
58 if fails>0: | 58 records.append(record) |
59 print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) | 59 successes += 1 |
60 | 60 SeqIO.write(records, output_handle, "fasta") |
61 def retrieveSeq(fastaFile, gffFeatures, extraSeq, filename, outputDir): | 61 |
62 #parsing the sequence file into a SeqIO dictionary. one contig per entry | 62 print('Retrived %s features successfully from %s with %s bp as extra' |
63 handle = open(fastaFile, "rU") | 63 ' sequence.' % (str(successes), filename, str(extra_seq))) |
64 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | 64 if fails > 0: |
65 handle.close() | 65 print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) |
66 | 66 |
67 with open(outputDir+'/'+filename+'.fasta','w') as output_handle: | 67 |
68 fails=0 | 68 def retrieve_seq(fasta_file, gff_features, extra_seq, filename, output_dir): |
69 successes=0 | 69 # parsing the sequence file into a SeqIO dictionary. one contig per entry |
70 records=[] | 70 handle = open(fasta_file, "rU") |
71 for locus, location in gffFeatures.items(): | 71 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) |
72 #print locus | 72 handle.close() |
73 contigSeq=records_dict[location[0]].seq | 73 |
74 coord1=location[1]-extraSeq | 74 with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: |
75 coord2=location[2]+extraSeq | 75 fails = 0 |
76 if coord1 < 0 or coord2 > len(contigSeq): | 76 successes = 0 |
77 fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') | 77 records = [] |
78 fail_log.write(locus+'\n') | 78 for locus, location in list(gff_features.items()): |
79 fail_log.close() | 79 # print locus |
80 fails+=1 | 80 contig_seq = records_dict[location[0]].seq |
81 else: | 81 coord1 = location[1] - extra_seq |
82 geneseq=str(contigSeq[coord1:coord2]) | 82 coord2 = location[2] + extra_seq |
83 if location[3] == '-': | 83 if coord1 < 0 or coord2 > len(contig_seq): |
84 seq = Seq(geneseq) | 84 fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') |
85 geneseq =str(seq.reverse_complement()) | 85 fail_log.write(locus + '\n') |
86 record = SeqRecord(Seq(geneseq), id=str(locus+'-'+str(location[0])+'#'+str(location[1])+'_'+str(location[2])), description='') | 86 fail_log.close() |
87 records.append(record) | 87 fails += 1 |
88 successes+=1 | 88 else: |
89 SeqIO.write(records, output_handle, "fasta") | 89 geneseq = str(contig_seq[coord1:coord2]) |
90 print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) | 90 if location[3] == '-': |
91 if fails>0: | 91 seq = Seq(geneseq) |
92 print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) | 92 geneseq = str(seq.reverse_complement()) |
93 | 93 record = SeqRecord(Seq(geneseq), |
94 def parseFeatures(tempGFF): | 94 id=str(locus + '-' + str(location[0]) + '#' + str(location[1]) + '_' + |
95 #parsing the feature file into a dictionary | 95 str(location[2])), |
96 gffFeatures={} | 96 description='') |
97 | 97 records.append(record) |
98 with open(tempGFF, 'r') as temp_genes: | 98 successes += 1 |
99 for line in temp_genes: | 99 SeqIO.write(records, output_handle, "fasta") |
100 line=line.split('\t') | 100 print('Retrived %s features successfully from %s with %s bp as extra' |
101 if "CDS" in line[2]: | 101 ' sequence.' % (str(successes), filename, str(extra_seq))) |
102 ID=line[-1].split(';') | 102 if fails > 0: |
103 locusID=str(ID[0].split('=')[1]) | 103 print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) |
104 contig=line[0] | 104 |
105 begining=int(line[3])-1 #to get the full sequence | 105 |
106 end=int(line[4]) | 106 def parse_features(temp_gff): |
107 strand=line[6] | 107 # parsing the feature file into a dictionary |
108 location=[contig, begining, end, strand] | 108 gff_features = {} |
109 gffFeatures[locusID]=location | 109 |
110 return gffFeatures | 110 with open(temp_gff, 'r') as temp_genes: |
111 | 111 for line in temp_genes: |
112 def gffParser(gffFile, extraSeq=0, outputDir='.', keepTemporaryFiles=False, IDs=None, coordFile=None): | 112 line = line.split('\t') |
113 | 113 if "CDS" in line[2]: |
114 filename=ntpath.basename(gffFile).replace('.gff', '') | 114 id = line[-1].split(';') |
115 | 115 locus_id = str(id[0].split('=')[1]) |
116 #cleaning temp files if they exist | 116 contig = line[0] |
117 if os.path.isfile(outputDir+'/'+filename+'_features.gff'): | 117 begining = int(line[3]) - 1 # to get the full sequence |
118 os.remove(outputDir+'/'+filename+'_features.gff') | 118 end = int(line[4]) |
119 if os.path.isfile(outputDir+'/'+filename+'_sequence.fasta'): | 119 strand = line[6] |
120 os.remove(outputDir+'/'+filename+'_sequence.fasta') | 120 location = [contig, begining, end, strand] |
121 | 121 gff_features[locus_id] = location |
122 #cleaning fails file if it exists | 122 return gff_features |
123 if os.path.isfile(outputDir+'/'+filename+'_fails.txt'): | 123 |
124 os.remove(outputDir+'/'+filename+'_fails.txt') | 124 |
125 | 125 def gff_parser(gff_file, extra_seq=0, output_dir='.', keep_temporary_files=False, ids=None, coord_file=None): |
126 if coordFile is None: | 126 filename = ntpath.basename(gff_file).replace('.gff', '') |
127 | 127 |
128 if IDs is not None: | 128 # cleaning temp files if they exist |
129 selectIDs=parseID(IDs) | 129 if os.path.isfile(output_dir + '/' + filename + '_features.gff'): |
130 else: | 130 os.remove(output_dir + '/' + filename + '_features.gff') |
131 selectIDs=None | 131 if os.path.isfile(output_dir + '/' + filename + '_sequence.fasta'): |
132 | 132 os.remove(output_dir + '/' + filename + '_sequence.fasta') |
133 #separating the gff into 2 different files: one with the features and another with the conting sequences | 133 |
134 with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_features.gff', 'a') as temp_genes, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: | 134 # cleaning fails file if it exists |
135 for line in in_handle: | 135 if os.path.isfile(output_dir + '/' + filename + '_fails.txt'): |
136 if not line.startswith('##'): | 136 os.remove(output_dir + '/' + filename + '_fails.txt') |
137 if '\t' in line: | 137 |
138 if selectIDs is not None: | 138 if coord_file is None: |
139 items=line.split('\t') | 139 |
140 ID=items[-1].split(';')[0] | 140 if ids is not None: |
141 ID=ID.split('=')[1] | 141 select_ids = parse_id(ids) |
142 if ID in selectIDs: | 142 else: |
143 temp_genes.write(line) | 143 select_ids = None |
144 else: | 144 |
145 temp_genes.write(line) | 145 # separating the gff into 2 different files: one with the features and another with the conting sequences |
146 else: | 146 with open(gff_file, 'r') as in_handle, open(output_dir + '/' + filename + '_features.gff', 'a') as temp_genes, \ |
147 temp_contigs.write(line) | 147 open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: |
148 | 148 for line in in_handle: |
149 gffFiles=parseFeatures(outputDir+'/'+filename+'_features.gff') | 149 if not line.startswith('##'): |
150 | 150 if '\t' in line: |
151 retrieveSeq(outputDir+'/'+filename+'_sequence.fasta', gffFiles, extraSeq, filename, outputDir) | 151 if select_ids is not None: |
152 | 152 items = line.split('\t') |
153 else: | 153 id = items[-1].split(';')[0] |
154 with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: | 154 id = id.split('=')[1] |
155 for line in in_handle: | 155 if id in select_ids: |
156 if not line.startswith('##'): | 156 temp_genes.write(line) |
157 if '\t' in line: | 157 else: |
158 pass | 158 temp_genes.write(line) |
159 else: | 159 else: |
160 temp_contigs.write(line) | 160 temp_contigs.write(line) |
161 | 161 |
162 retrieveSeq_File(outputDir+'/'+filename+'_sequence.fasta', coordFile, extraSeq, filename, outputDir) | 162 gff_files = parse_features(output_dir + '/' + filename + '_features.gff') |
163 | 163 |
164 #removing temp files | 164 retrieve_seq(output_dir + '/' + filename + '_sequence.fasta', gff_files, extra_seq, filename, output_dir) |
165 if not keepTemporaryFiles: | 165 |
166 try: | 166 else: |
167 os.remove(outputDir+'/'+filename+'_features.gff') | 167 with open(gff_file, 'r') as in_handle, \ |
168 except: | 168 open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: |
169 pass | 169 for line in in_handle: |
170 os.remove(outputDir+'/'+filename+'_sequence.fasta') | 170 if not line.startswith('##'): |
171 if '\t' in line: | |
172 pass | |
173 else: | |
174 temp_contigs.write(line) | |
175 | |
176 retrieve_seq_file(output_dir + '/' + filename + '_sequence.fasta', coord_file, extra_seq, filename, output_dir) | |
177 | |
178 # removing temp files | |
179 if not keep_temporary_files: | |
180 try: | |
181 os.remove(output_dir + '/' + filename + '_features.gff') | |
182 except: | |
183 pass | |
184 os.remove(output_dir + '/' + filename + '_sequence.fasta') | |
185 | |
171 | 186 |
172 def main(): | 187 def main(): |
173 | 188 parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', |
174 version='1.0.0' | 189 epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') |
175 | 190 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) |
176 parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') | 191 |
177 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) | 192 parser.add_argument('-i', '--input', |
178 | 193 help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from' |
179 parser.add_argument('-i', '--input', help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from PROKKA).', type=argparse.FileType('r'), required=True) | 194 ' PROKKA).', type=argparse.FileType('r'), required=True) |
180 parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, required=False) | 195 parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, |
181 parser.add_argument('-k','--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', action='store_true') | 196 required=False) |
182 parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', required=False) | 197 parser.add_argument('-k', '--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', |
183 | 198 action='store_true') |
184 | 199 parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', |
185 parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() | 200 required=False) |
186 parser_optional_selected_regions_exclusive.add_argument('-s', '--select', help='txt file with the IDs of interest, one per line', type=argparse.FileType('r'), required=False) | 201 |
187 parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', help='Sequence coordinates to be retrieved. Requires contig ID and coords (contig,strart,end) in a csv file. One per line.', type=argparse.FileType('r'), required=False) | 202 parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() |
188 | 203 parser_optional_selected_regions_exclusive.add_argument('-s', '--select', |
189 args = parser.parse_args() | 204 help='txt file with the IDs of interest, one per line', |
190 | 205 type=argparse.FileType('r'), required=False) |
191 gffParser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), args.keepTemporaryFiles, os.path.abspath(args.select.name) if args.select is not None else None, os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) | 206 parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', |
207 help='Sequence coordinates to be retrieved. Requires contig' | |
208 ' ID and coords (contig,strart,end) in a csv file. One' | |
209 ' per line.', type=argparse.FileType('r'), | |
210 required=False) | |
211 | |
212 args = parser.parse_args() | |
213 | |
214 args.outputDir = os.path.abspath(args.outputDir) | |
215 if not os.path.isdir(args.outputDir): | |
216 os.makedirs(args.outputDir) | |
217 | |
218 gff_parser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), | |
219 args.keepTemporaryFiles, | |
220 os.path.abspath(args.select.name) if args.select is not None else None, | |
221 os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) | |
192 | 222 |
193 | 223 |
194 if __name__ == "__main__": | 224 if __name__ == "__main__": |
195 main() | 225 main() |