Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/utils/gffParser.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
| author | cstrittmatter |
|---|---|
| date | Tue, 28 Jan 2020 10:42:31 -0500 |
| parents | 965517909457 |
| children |
comparison
equal
deleted
inserted
replaced
| 2:6837f733b4aa | 3:0cbed1c0a762 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python3 |
| 2 | 2 |
| 3 import argparse, sys, os | 3 import argparse |
| 4 import os | |
| 4 from Bio import SeqIO | 5 from Bio import SeqIO |
| 5 from Bio.Seq import Seq | 6 from Bio.Seq import Seq |
| 6 from Bio.SeqRecord import SeqRecord | 7 from Bio.SeqRecord import SeqRecord |
| 7 from Bio.Alphabet import IUPAC | |
| 8 import ntpath | 8 import ntpath |
| 9 | 9 |
| 10 def parseID(filename): | 10 version = '1.0' |
| 11 #get wanted feature IDs | 11 |
| 12 gffIDs=[] | 12 |
| 13 with open(filename, 'r') as in_handle: | 13 def parse_id(filename): |
| 14 for line in in_handle: | 14 # get wanted feature IDs |
| 15 line=line.strip() | 15 gff_ids = [] |
| 16 gffIDs.append(line) | 16 with open(filename, 'r') as in_handle: |
| 17 return gffIDs | 17 for line in in_handle: |
| 18 | 18 line = line.strip() |
| 19 def retrieveSeq_File(fastaFile, coordFile, extraSeq, filename, outputDir): | 19 gff_ids.append(line) |
| 20 #Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. | 20 return gff_ids |
| 21 handle = open(fastaFile, "rU") | 21 |
| 22 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | 22 |
| 23 handle.close() | 23 def retrieve_seq_file(fasta_file, coord_file, extra_seq, filename, output_dir): |
| 24 | 24 # Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. |
| 25 Seq2Get={} | 25 handle = open(fasta_file, "rU") |
| 26 with open(coordFile, 'r') as sequeces2get: | 26 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) |
| 27 for line in sequeces2get: | 27 handle.close() |
| 28 line=line.split(',') | 28 |
| 29 coords=(int(line[-2]), int(line[-1])) | 29 seq_2_get = {} |
| 30 contigID=line[0] | 30 with open(coord_file, 'r') as sequeces2get: |
| 31 if contigID in Seq2Get.keys(): | 31 for line in sequeces2get: |
| 32 Seq2Get[contigID].append(coords) | 32 line = line.split(',') |
| 33 else: | 33 coords = (int(line[-2]), int(line[-1])) |
| 34 Seq2Get[contigID]=[coords] | 34 contig_id = line[0] |
| 35 | 35 if contig_id in list(seq_2_get.keys()): |
| 36 with open(outputDir+'/'+filename+'.fasta','w') as output_handle: | 36 seq_2_get[contig_id].append(coords) |
| 37 fails=0 | 37 else: |
| 38 successes=0 | 38 seq_2_get[contig_id] = [coords] |
| 39 records=[] | 39 |
| 40 for contig, listCoords in Seq2Get.items(): | 40 with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: |
| 41 contigSeq=records_dict[contig].seq | 41 fails = 0 |
| 42 for coord in listCoords: | 42 successes = 0 |
| 43 coord1=coord[0]-extraSeq | 43 records = [] |
| 44 coord2=coord[1]+extraSeq | 44 for contig, listCoords in list(seq_2_get.items()): |
| 45 if coord1 < 0 or coord2 > len(contigSeq): | 45 contig_seq = records_dict[contig].seq |
| 46 fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') | 46 for coord in listCoords: |
| 47 fail_log.write(contig + ',' + str(coord[0])+','+ str(coord[1])+'\n') | 47 coord1 = coord[0] - extra_seq |
| 48 fail_log.close() | 48 coord2 = coord[1] + extra_seq |
| 49 fails+=1 | 49 if coord1 < 0 or coord2 > len(contig_seq): |
| 50 else: | 50 fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') |
| 51 geneseq=str(contigSeq[coord1:coord2]) | 51 fail_log.write(contig + ',' + str(coord[0]) + ',' + str(coord[1]) + '\n') |
| 52 record = SeqRecord(Seq(geneseq), id=str(str(contig)+'#'+str(coord1)+'_'+str(coord2)), description='') | 52 fail_log.close() |
| 53 records.append(record) | 53 fails += 1 |
| 54 successes+=1 | 54 else: |
| 55 SeqIO.write(records, output_handle, "fasta") | 55 geneseq = str(contig_seq[coord1:coord2]) |
| 56 | 56 record = SeqRecord(Seq(geneseq), id=str(str(contig) + '#' + str(coord1) + '_' + str(coord2)), |
| 57 print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) | 57 description='') |
| 58 if fails>0: | 58 records.append(record) |
| 59 print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) | 59 successes += 1 |
| 60 | 60 SeqIO.write(records, output_handle, "fasta") |
| 61 def retrieveSeq(fastaFile, gffFeatures, extraSeq, filename, outputDir): | 61 |
| 62 #parsing the sequence file into a SeqIO dictionary. one contig per entry | 62 print('Retrived %s features successfully from %s with %s bp as extra' |
| 63 handle = open(fastaFile, "rU") | 63 ' sequence.' % (str(successes), filename, str(extra_seq))) |
| 64 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | 64 if fails > 0: |
| 65 handle.close() | 65 print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) |
| 66 | 66 |
| 67 with open(outputDir+'/'+filename+'.fasta','w') as output_handle: | 67 |
| 68 fails=0 | 68 def retrieve_seq(fasta_file, gff_features, extra_seq, filename, output_dir): |
| 69 successes=0 | 69 # parsing the sequence file into a SeqIO dictionary. one contig per entry |
| 70 records=[] | 70 handle = open(fasta_file, "rU") |
| 71 for locus, location in gffFeatures.items(): | 71 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) |
| 72 #print locus | 72 handle.close() |
| 73 contigSeq=records_dict[location[0]].seq | 73 |
| 74 coord1=location[1]-extraSeq | 74 with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: |
| 75 coord2=location[2]+extraSeq | 75 fails = 0 |
| 76 if coord1 < 0 or coord2 > len(contigSeq): | 76 successes = 0 |
| 77 fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') | 77 records = [] |
| 78 fail_log.write(locus+'\n') | 78 for locus, location in list(gff_features.items()): |
| 79 fail_log.close() | 79 # print locus |
| 80 fails+=1 | 80 contig_seq = records_dict[location[0]].seq |
| 81 else: | 81 coord1 = location[1] - extra_seq |
| 82 geneseq=str(contigSeq[coord1:coord2]) | 82 coord2 = location[2] + extra_seq |
| 83 if location[3] == '-': | 83 if coord1 < 0 or coord2 > len(contig_seq): |
| 84 seq = Seq(geneseq) | 84 fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') |
| 85 geneseq =str(seq.reverse_complement()) | 85 fail_log.write(locus + '\n') |
| 86 record = SeqRecord(Seq(geneseq), id=str(locus+'-'+str(location[0])+'#'+str(location[1])+'_'+str(location[2])), description='') | 86 fail_log.close() |
| 87 records.append(record) | 87 fails += 1 |
| 88 successes+=1 | 88 else: |
| 89 SeqIO.write(records, output_handle, "fasta") | 89 geneseq = str(contig_seq[coord1:coord2]) |
| 90 print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) | 90 if location[3] == '-': |
| 91 if fails>0: | 91 seq = Seq(geneseq) |
| 92 print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) | 92 geneseq = str(seq.reverse_complement()) |
| 93 | 93 record = SeqRecord(Seq(geneseq), |
| 94 def parseFeatures(tempGFF): | 94 id=str(locus + '-' + str(location[0]) + '#' + str(location[1]) + '_' + |
| 95 #parsing the feature file into a dictionary | 95 str(location[2])), |
| 96 gffFeatures={} | 96 description='') |
| 97 | 97 records.append(record) |
| 98 with open(tempGFF, 'r') as temp_genes: | 98 successes += 1 |
| 99 for line in temp_genes: | 99 SeqIO.write(records, output_handle, "fasta") |
| 100 line=line.split('\t') | 100 print('Retrived %s features successfully from %s with %s bp as extra' |
| 101 if "CDS" in line[2]: | 101 ' sequence.' % (str(successes), filename, str(extra_seq))) |
| 102 ID=line[-1].split(';') | 102 if fails > 0: |
| 103 locusID=str(ID[0].split('=')[1]) | 103 print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) |
| 104 contig=line[0] | 104 |
| 105 begining=int(line[3])-1 #to get the full sequence | 105 |
| 106 end=int(line[4]) | 106 def parse_features(temp_gff): |
| 107 strand=line[6] | 107 # parsing the feature file into a dictionary |
| 108 location=[contig, begining, end, strand] | 108 gff_features = {} |
| 109 gffFeatures[locusID]=location | 109 |
| 110 return gffFeatures | 110 with open(temp_gff, 'r') as temp_genes: |
| 111 | 111 for line in temp_genes: |
| 112 def gffParser(gffFile, extraSeq=0, outputDir='.', keepTemporaryFiles=False, IDs=None, coordFile=None): | 112 line = line.split('\t') |
| 113 | 113 if "CDS" in line[2]: |
| 114 filename=ntpath.basename(gffFile).replace('.gff', '') | 114 id = line[-1].split(';') |
| 115 | 115 locus_id = str(id[0].split('=')[1]) |
| 116 #cleaning temp files if they exist | 116 contig = line[0] |
| 117 if os.path.isfile(outputDir+'/'+filename+'_features.gff'): | 117 begining = int(line[3]) - 1 # to get the full sequence |
| 118 os.remove(outputDir+'/'+filename+'_features.gff') | 118 end = int(line[4]) |
| 119 if os.path.isfile(outputDir+'/'+filename+'_sequence.fasta'): | 119 strand = line[6] |
| 120 os.remove(outputDir+'/'+filename+'_sequence.fasta') | 120 location = [contig, begining, end, strand] |
| 121 | 121 gff_features[locus_id] = location |
| 122 #cleaning fails file if it exists | 122 return gff_features |
| 123 if os.path.isfile(outputDir+'/'+filename+'_fails.txt'): | 123 |
| 124 os.remove(outputDir+'/'+filename+'_fails.txt') | 124 |
| 125 | 125 def gff_parser(gff_file, extra_seq=0, output_dir='.', keep_temporary_files=False, ids=None, coord_file=None): |
| 126 if coordFile is None: | 126 filename = ntpath.basename(gff_file).replace('.gff', '') |
| 127 | 127 |
| 128 if IDs is not None: | 128 # cleaning temp files if they exist |
| 129 selectIDs=parseID(IDs) | 129 if os.path.isfile(output_dir + '/' + filename + '_features.gff'): |
| 130 else: | 130 os.remove(output_dir + '/' + filename + '_features.gff') |
| 131 selectIDs=None | 131 if os.path.isfile(output_dir + '/' + filename + '_sequence.fasta'): |
| 132 | 132 os.remove(output_dir + '/' + filename + '_sequence.fasta') |
| 133 #separating the gff into 2 different files: one with the features and another with the conting sequences | 133 |
| 134 with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_features.gff', 'a') as temp_genes, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: | 134 # cleaning fails file if it exists |
| 135 for line in in_handle: | 135 if os.path.isfile(output_dir + '/' + filename + '_fails.txt'): |
| 136 if not line.startswith('##'): | 136 os.remove(output_dir + '/' + filename + '_fails.txt') |
| 137 if '\t' in line: | 137 |
| 138 if selectIDs is not None: | 138 if coord_file is None: |
| 139 items=line.split('\t') | 139 |
| 140 ID=items[-1].split(';')[0] | 140 if ids is not None: |
| 141 ID=ID.split('=')[1] | 141 select_ids = parse_id(ids) |
| 142 if ID in selectIDs: | 142 else: |
| 143 temp_genes.write(line) | 143 select_ids = None |
| 144 else: | 144 |
| 145 temp_genes.write(line) | 145 # separating the gff into 2 different files: one with the features and another with the conting sequences |
| 146 else: | 146 with open(gff_file, 'r') as in_handle, open(output_dir + '/' + filename + '_features.gff', 'a') as temp_genes, \ |
| 147 temp_contigs.write(line) | 147 open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: |
| 148 | 148 for line in in_handle: |
| 149 gffFiles=parseFeatures(outputDir+'/'+filename+'_features.gff') | 149 if not line.startswith('##'): |
| 150 | 150 if '\t' in line: |
| 151 retrieveSeq(outputDir+'/'+filename+'_sequence.fasta', gffFiles, extraSeq, filename, outputDir) | 151 if select_ids is not None: |
| 152 | 152 items = line.split('\t') |
| 153 else: | 153 id = items[-1].split(';')[0] |
| 154 with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: | 154 id = id.split('=')[1] |
| 155 for line in in_handle: | 155 if id in select_ids: |
| 156 if not line.startswith('##'): | 156 temp_genes.write(line) |
| 157 if '\t' in line: | 157 else: |
| 158 pass | 158 temp_genes.write(line) |
| 159 else: | 159 else: |
| 160 temp_contigs.write(line) | 160 temp_contigs.write(line) |
| 161 | 161 |
| 162 retrieveSeq_File(outputDir+'/'+filename+'_sequence.fasta', coordFile, extraSeq, filename, outputDir) | 162 gff_files = parse_features(output_dir + '/' + filename + '_features.gff') |
| 163 | 163 |
| 164 #removing temp files | 164 retrieve_seq(output_dir + '/' + filename + '_sequence.fasta', gff_files, extra_seq, filename, output_dir) |
| 165 if not keepTemporaryFiles: | 165 |
| 166 try: | 166 else: |
| 167 os.remove(outputDir+'/'+filename+'_features.gff') | 167 with open(gff_file, 'r') as in_handle, \ |
| 168 except: | 168 open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: |
| 169 pass | 169 for line in in_handle: |
| 170 os.remove(outputDir+'/'+filename+'_sequence.fasta') | 170 if not line.startswith('##'): |
| 171 if '\t' in line: | |
| 172 pass | |
| 173 else: | |
| 174 temp_contigs.write(line) | |
| 175 | |
| 176 retrieve_seq_file(output_dir + '/' + filename + '_sequence.fasta', coord_file, extra_seq, filename, output_dir) | |
| 177 | |
| 178 # removing temp files | |
| 179 if not keep_temporary_files: | |
| 180 try: | |
| 181 os.remove(output_dir + '/' + filename + '_features.gff') | |
| 182 except: | |
| 183 pass | |
| 184 os.remove(output_dir + '/' + filename + '_sequence.fasta') | |
| 185 | |
| 171 | 186 |
| 172 def main(): | 187 def main(): |
| 173 | 188 parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', |
| 174 version='1.0.0' | 189 epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') |
| 175 | 190 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) |
| 176 parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') | 191 |
| 177 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) | 192 parser.add_argument('-i', '--input', |
| 178 | 193 help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from' |
| 179 parser.add_argument('-i', '--input', help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from PROKKA).', type=argparse.FileType('r'), required=True) | 194 ' PROKKA).', type=argparse.FileType('r'), required=True) |
| 180 parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, required=False) | 195 parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, |
| 181 parser.add_argument('-k','--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', action='store_true') | 196 required=False) |
| 182 parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', required=False) | 197 parser.add_argument('-k', '--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', |
| 183 | 198 action='store_true') |
| 184 | 199 parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', |
| 185 parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() | 200 required=False) |
| 186 parser_optional_selected_regions_exclusive.add_argument('-s', '--select', help='txt file with the IDs of interest, one per line', type=argparse.FileType('r'), required=False) | 201 |
| 187 parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', help='Sequence coordinates to be retrieved. Requires contig ID and coords (contig,strart,end) in a csv file. One per line.', type=argparse.FileType('r'), required=False) | 202 parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() |
| 188 | 203 parser_optional_selected_regions_exclusive.add_argument('-s', '--select', |
| 189 args = parser.parse_args() | 204 help='txt file with the IDs of interest, one per line', |
| 190 | 205 type=argparse.FileType('r'), required=False) |
| 191 gffParser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), args.keepTemporaryFiles, os.path.abspath(args.select.name) if args.select is not None else None, os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) | 206 parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', |
| 207 help='Sequence coordinates to be retrieved. Requires contig' | |
| 208 ' ID and coords (contig,strart,end) in a csv file. One' | |
| 209 ' per line.', type=argparse.FileType('r'), | |
| 210 required=False) | |
| 211 | |
| 212 args = parser.parse_args() | |
| 213 | |
| 214 args.outputDir = os.path.abspath(args.outputDir) | |
| 215 if not os.path.isdir(args.outputDir): | |
| 216 os.makedirs(args.outputDir) | |
| 217 | |
| 218 gff_parser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), | |
| 219 args.keepTemporaryFiles, | |
| 220 os.path.abspath(args.select.name) if args.select is not None else None, | |
| 221 os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) | |
| 192 | 222 |
| 193 | 223 |
| 194 if __name__ == "__main__": | 224 if __name__ == "__main__": |
| 195 main() | 225 main() |
