comparison scripts/ReMatCh/utils/gffParser.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
comparison
equal deleted inserted replaced
2:6837f733b4aa 3:0cbed1c0a762
1 #!/usr/bin/env python 1 #!/usr/bin/env python3
2 2
3 import argparse, sys, os 3 import argparse
4 import os
4 from Bio import SeqIO 5 from Bio import SeqIO
5 from Bio.Seq import Seq 6 from Bio.Seq import Seq
6 from Bio.SeqRecord import SeqRecord 7 from Bio.SeqRecord import SeqRecord
7 from Bio.Alphabet import IUPAC
8 import ntpath 8 import ntpath
9 9
10 def parseID(filename): 10 version = '1.0'
11 #get wanted feature IDs 11
12 gffIDs=[] 12
13 with open(filename, 'r') as in_handle: 13 def parse_id(filename):
14 for line in in_handle: 14 # get wanted feature IDs
15 line=line.strip() 15 gff_ids = []
16 gffIDs.append(line) 16 with open(filename, 'r') as in_handle:
17 return gffIDs 17 for line in in_handle:
18 18 line = line.strip()
19 def retrieveSeq_File(fastaFile, coordFile, extraSeq, filename, outputDir): 19 gff_ids.append(line)
20 #Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. 20 return gff_ids
21 handle = open(fastaFile, "rU") 21
22 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) 22
23 handle.close() 23 def retrieve_seq_file(fasta_file, coord_file, extra_seq, filename, output_dir):
24 24 # Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences.
25 Seq2Get={} 25 handle = open(fasta_file, "rU")
26 with open(coordFile, 'r') as sequeces2get: 26 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
27 for line in sequeces2get: 27 handle.close()
28 line=line.split(',') 28
29 coords=(int(line[-2]), int(line[-1])) 29 seq_2_get = {}
30 contigID=line[0] 30 with open(coord_file, 'r') as sequeces2get:
31 if contigID in Seq2Get.keys(): 31 for line in sequeces2get:
32 Seq2Get[contigID].append(coords) 32 line = line.split(',')
33 else: 33 coords = (int(line[-2]), int(line[-1]))
34 Seq2Get[contigID]=[coords] 34 contig_id = line[0]
35 35 if contig_id in list(seq_2_get.keys()):
36 with open(outputDir+'/'+filename+'.fasta','w') as output_handle: 36 seq_2_get[contig_id].append(coords)
37 fails=0 37 else:
38 successes=0 38 seq_2_get[contig_id] = [coords]
39 records=[] 39
40 for contig, listCoords in Seq2Get.items(): 40 with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle:
41 contigSeq=records_dict[contig].seq 41 fails = 0
42 for coord in listCoords: 42 successes = 0
43 coord1=coord[0]-extraSeq 43 records = []
44 coord2=coord[1]+extraSeq 44 for contig, listCoords in list(seq_2_get.items()):
45 if coord1 < 0 or coord2 > len(contigSeq): 45 contig_seq = records_dict[contig].seq
46 fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') 46 for coord in listCoords:
47 fail_log.write(contig + ',' + str(coord[0])+','+ str(coord[1])+'\n') 47 coord1 = coord[0] - extra_seq
48 fail_log.close() 48 coord2 = coord[1] + extra_seq
49 fails+=1 49 if coord1 < 0 or coord2 > len(contig_seq):
50 else: 50 fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a')
51 geneseq=str(contigSeq[coord1:coord2]) 51 fail_log.write(contig + ',' + str(coord[0]) + ',' + str(coord[1]) + '\n')
52 record = SeqRecord(Seq(geneseq), id=str(str(contig)+'#'+str(coord1)+'_'+str(coord2)), description='') 52 fail_log.close()
53 records.append(record) 53 fails += 1
54 successes+=1 54 else:
55 SeqIO.write(records, output_handle, "fasta") 55 geneseq = str(contig_seq[coord1:coord2])
56 56 record = SeqRecord(Seq(geneseq), id=str(str(contig) + '#' + str(coord1) + '_' + str(coord2)),
57 print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) 57 description='')
58 if fails>0: 58 records.append(record)
59 print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) 59 successes += 1
60 60 SeqIO.write(records, output_handle, "fasta")
61 def retrieveSeq(fastaFile, gffFeatures, extraSeq, filename, outputDir): 61
62 #parsing the sequence file into a SeqIO dictionary. one contig per entry 62 print('Retrived %s features successfully from %s with %s bp as extra'
63 handle = open(fastaFile, "rU") 63 ' sequence.' % (str(successes), filename, str(extra_seq)))
64 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) 64 if fails > 0:
65 handle.close() 65 print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename))
66 66
67 with open(outputDir+'/'+filename+'.fasta','w') as output_handle: 67
68 fails=0 68 def retrieve_seq(fasta_file, gff_features, extra_seq, filename, output_dir):
69 successes=0 69 # parsing the sequence file into a SeqIO dictionary. one contig per entry
70 records=[] 70 handle = open(fasta_file, "rU")
71 for locus, location in gffFeatures.items(): 71 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
72 #print locus 72 handle.close()
73 contigSeq=records_dict[location[0]].seq 73
74 coord1=location[1]-extraSeq 74 with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle:
75 coord2=location[2]+extraSeq 75 fails = 0
76 if coord1 < 0 or coord2 > len(contigSeq): 76 successes = 0
77 fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') 77 records = []
78 fail_log.write(locus+'\n') 78 for locus, location in list(gff_features.items()):
79 fail_log.close() 79 # print locus
80 fails+=1 80 contig_seq = records_dict[location[0]].seq
81 else: 81 coord1 = location[1] - extra_seq
82 geneseq=str(contigSeq[coord1:coord2]) 82 coord2 = location[2] + extra_seq
83 if location[3] == '-': 83 if coord1 < 0 or coord2 > len(contig_seq):
84 seq = Seq(geneseq) 84 fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a')
85 geneseq =str(seq.reverse_complement()) 85 fail_log.write(locus + '\n')
86 record = SeqRecord(Seq(geneseq), id=str(locus+'-'+str(location[0])+'#'+str(location[1])+'_'+str(location[2])), description='') 86 fail_log.close()
87 records.append(record) 87 fails += 1
88 successes+=1 88 else:
89 SeqIO.write(records, output_handle, "fasta") 89 geneseq = str(contig_seq[coord1:coord2])
90 print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) 90 if location[3] == '-':
91 if fails>0: 91 seq = Seq(geneseq)
92 print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) 92 geneseq = str(seq.reverse_complement())
93 93 record = SeqRecord(Seq(geneseq),
94 def parseFeatures(tempGFF): 94 id=str(locus + '-' + str(location[0]) + '#' + str(location[1]) + '_' +
95 #parsing the feature file into a dictionary 95 str(location[2])),
96 gffFeatures={} 96 description='')
97 97 records.append(record)
98 with open(tempGFF, 'r') as temp_genes: 98 successes += 1
99 for line in temp_genes: 99 SeqIO.write(records, output_handle, "fasta")
100 line=line.split('\t') 100 print('Retrived %s features successfully from %s with %s bp as extra'
101 if "CDS" in line[2]: 101 ' sequence.' % (str(successes), filename, str(extra_seq)))
102 ID=line[-1].split(';') 102 if fails > 0:
103 locusID=str(ID[0].split('=')[1]) 103 print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename))
104 contig=line[0] 104
105 begining=int(line[3])-1 #to get the full sequence 105
106 end=int(line[4]) 106 def parse_features(temp_gff):
107 strand=line[6] 107 # parsing the feature file into a dictionary
108 location=[contig, begining, end, strand] 108 gff_features = {}
109 gffFeatures[locusID]=location 109
110 return gffFeatures 110 with open(temp_gff, 'r') as temp_genes:
111 111 for line in temp_genes:
112 def gffParser(gffFile, extraSeq=0, outputDir='.', keepTemporaryFiles=False, IDs=None, coordFile=None): 112 line = line.split('\t')
113 113 if "CDS" in line[2]:
114 filename=ntpath.basename(gffFile).replace('.gff', '') 114 id = line[-1].split(';')
115 115 locus_id = str(id[0].split('=')[1])
116 #cleaning temp files if they exist 116 contig = line[0]
117 if os.path.isfile(outputDir+'/'+filename+'_features.gff'): 117 begining = int(line[3]) - 1 # to get the full sequence
118 os.remove(outputDir+'/'+filename+'_features.gff') 118 end = int(line[4])
119 if os.path.isfile(outputDir+'/'+filename+'_sequence.fasta'): 119 strand = line[6]
120 os.remove(outputDir+'/'+filename+'_sequence.fasta') 120 location = [contig, begining, end, strand]
121 121 gff_features[locus_id] = location
122 #cleaning fails file if it exists 122 return gff_features
123 if os.path.isfile(outputDir+'/'+filename+'_fails.txt'): 123
124 os.remove(outputDir+'/'+filename+'_fails.txt') 124
125 125 def gff_parser(gff_file, extra_seq=0, output_dir='.', keep_temporary_files=False, ids=None, coord_file=None):
126 if coordFile is None: 126 filename = ntpath.basename(gff_file).replace('.gff', '')
127 127
128 if IDs is not None: 128 # cleaning temp files if they exist
129 selectIDs=parseID(IDs) 129 if os.path.isfile(output_dir + '/' + filename + '_features.gff'):
130 else: 130 os.remove(output_dir + '/' + filename + '_features.gff')
131 selectIDs=None 131 if os.path.isfile(output_dir + '/' + filename + '_sequence.fasta'):
132 132 os.remove(output_dir + '/' + filename + '_sequence.fasta')
133 #separating the gff into 2 different files: one with the features and another with the conting sequences 133
134 with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_features.gff', 'a') as temp_genes, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: 134 # cleaning fails file if it exists
135 for line in in_handle: 135 if os.path.isfile(output_dir + '/' + filename + '_fails.txt'):
136 if not line.startswith('##'): 136 os.remove(output_dir + '/' + filename + '_fails.txt')
137 if '\t' in line: 137
138 if selectIDs is not None: 138 if coord_file is None:
139 items=line.split('\t') 139
140 ID=items[-1].split(';')[0] 140 if ids is not None:
141 ID=ID.split('=')[1] 141 select_ids = parse_id(ids)
142 if ID in selectIDs: 142 else:
143 temp_genes.write(line) 143 select_ids = None
144 else: 144
145 temp_genes.write(line) 145 # separating the gff into 2 different files: one with the features and another with the conting sequences
146 else: 146 with open(gff_file, 'r') as in_handle, open(output_dir + '/' + filename + '_features.gff', 'a') as temp_genes, \
147 temp_contigs.write(line) 147 open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs:
148 148 for line in in_handle:
149 gffFiles=parseFeatures(outputDir+'/'+filename+'_features.gff') 149 if not line.startswith('##'):
150 150 if '\t' in line:
151 retrieveSeq(outputDir+'/'+filename+'_sequence.fasta', gffFiles, extraSeq, filename, outputDir) 151 if select_ids is not None:
152 152 items = line.split('\t')
153 else: 153 id = items[-1].split(';')[0]
154 with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: 154 id = id.split('=')[1]
155 for line in in_handle: 155 if id in select_ids:
156 if not line.startswith('##'): 156 temp_genes.write(line)
157 if '\t' in line: 157 else:
158 pass 158 temp_genes.write(line)
159 else: 159 else:
160 temp_contigs.write(line) 160 temp_contigs.write(line)
161 161
162 retrieveSeq_File(outputDir+'/'+filename+'_sequence.fasta', coordFile, extraSeq, filename, outputDir) 162 gff_files = parse_features(output_dir + '/' + filename + '_features.gff')
163 163
164 #removing temp files 164 retrieve_seq(output_dir + '/' + filename + '_sequence.fasta', gff_files, extra_seq, filename, output_dir)
165 if not keepTemporaryFiles: 165
166 try: 166 else:
167 os.remove(outputDir+'/'+filename+'_features.gff') 167 with open(gff_file, 'r') as in_handle, \
168 except: 168 open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs:
169 pass 169 for line in in_handle:
170 os.remove(outputDir+'/'+filename+'_sequence.fasta') 170 if not line.startswith('##'):
171 if '\t' in line:
172 pass
173 else:
174 temp_contigs.write(line)
175
176 retrieve_seq_file(output_dir + '/' + filename + '_sequence.fasta', coord_file, extra_seq, filename, output_dir)
177
178 # removing temp files
179 if not keep_temporary_files:
180 try:
181 os.remove(output_dir + '/' + filename + '_features.gff')
182 except:
183 pass
184 os.remove(output_dir + '/' + filename + '_sequence.fasta')
185
171 186
172 def main(): 187 def main():
173 188 parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.',
174 version='1.0.0' 189 epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)')
175 190 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))
176 parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') 191
177 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) 192 parser.add_argument('-i', '--input',
178 193 help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from'
179 parser.add_argument('-i', '--input', help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from PROKKA).', type=argparse.FileType('r'), required=True) 194 ' PROKKA).', type=argparse.FileType('r'), required=True)
180 parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, required=False) 195 parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int,
181 parser.add_argument('-k','--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', action='store_true') 196 required=False)
182 parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', required=False) 197 parser.add_argument('-k', '--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.',
183 198 action='store_true')
184 199 parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.',
185 parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() 200 required=False)
186 parser_optional_selected_regions_exclusive.add_argument('-s', '--select', help='txt file with the IDs of interest, one per line', type=argparse.FileType('r'), required=False) 201
187 parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', help='Sequence coordinates to be retrieved. Requires contig ID and coords (contig,strart,end) in a csv file. One per line.', type=argparse.FileType('r'), required=False) 202 parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group()
188 203 parser_optional_selected_regions_exclusive.add_argument('-s', '--select',
189 args = parser.parse_args() 204 help='txt file with the IDs of interest, one per line',
190 205 type=argparse.FileType('r'), required=False)
191 gffParser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), args.keepTemporaryFiles, os.path.abspath(args.select.name) if args.select is not None else None, os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) 206 parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile',
207 help='Sequence coordinates to be retrieved. Requires contig'
208 ' ID and coords (contig,strart,end) in a csv file. One'
209 ' per line.', type=argparse.FileType('r'),
210 required=False)
211
212 args = parser.parse_args()
213
214 args.outputDir = os.path.abspath(args.outputDir)
215 if not os.path.isdir(args.outputDir):
216 os.makedirs(args.outputDir)
217
218 gff_parser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir),
219 args.keepTemporaryFiles,
220 os.path.abspath(args.select.name) if args.select is not None else None,
221 os.path.abspath(args.fromFile.name) if args.fromFile is not None else None)
192 222
193 223
194 if __name__ == "__main__": 224 if __name__ == "__main__":
195 main() 225 main()