Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/utils/gffParser.py @ 0:965517909457 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
| author | cstrittmatter |
|---|---|
| date | Wed, 22 Jan 2020 08:41:44 -0500 |
| parents | |
| children | 0cbed1c0a762 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:965517909457 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse, sys, os | |
| 4 from Bio import SeqIO | |
| 5 from Bio.Seq import Seq | |
| 6 from Bio.SeqRecord import SeqRecord | |
| 7 from Bio.Alphabet import IUPAC | |
| 8 import ntpath | |
| 9 | |
| 10 def parseID(filename): | |
| 11 #get wanted feature IDs | |
| 12 gffIDs=[] | |
| 13 with open(filename, 'r') as in_handle: | |
| 14 for line in in_handle: | |
| 15 line=line.strip() | |
| 16 gffIDs.append(line) | |
| 17 return gffIDs | |
| 18 | |
| 19 def retrieveSeq_File(fastaFile, coordFile, extraSeq, filename, outputDir): | |
| 20 #Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. | |
| 21 handle = open(fastaFile, "rU") | |
| 22 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | |
| 23 handle.close() | |
| 24 | |
| 25 Seq2Get={} | |
| 26 with open(coordFile, 'r') as sequeces2get: | |
| 27 for line in sequeces2get: | |
| 28 line=line.split(',') | |
| 29 coords=(int(line[-2]), int(line[-1])) | |
| 30 contigID=line[0] | |
| 31 if contigID in Seq2Get.keys(): | |
| 32 Seq2Get[contigID].append(coords) | |
| 33 else: | |
| 34 Seq2Get[contigID]=[coords] | |
| 35 | |
| 36 with open(outputDir+'/'+filename+'.fasta','w') as output_handle: | |
| 37 fails=0 | |
| 38 successes=0 | |
| 39 records=[] | |
| 40 for contig, listCoords in Seq2Get.items(): | |
| 41 contigSeq=records_dict[contig].seq | |
| 42 for coord in listCoords: | |
| 43 coord1=coord[0]-extraSeq | |
| 44 coord2=coord[1]+extraSeq | |
| 45 if coord1 < 0 or coord2 > len(contigSeq): | |
| 46 fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') | |
| 47 fail_log.write(contig + ',' + str(coord[0])+','+ str(coord[1])+'\n') | |
| 48 fail_log.close() | |
| 49 fails+=1 | |
| 50 else: | |
| 51 geneseq=str(contigSeq[coord1:coord2]) | |
| 52 record = SeqRecord(Seq(geneseq), id=str(str(contig)+'#'+str(coord1)+'_'+str(coord2)), description='') | |
| 53 records.append(record) | |
| 54 successes+=1 | |
| 55 SeqIO.write(records, output_handle, "fasta") | |
| 56 | |
| 57 print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) | |
| 58 if fails>0: | |
| 59 print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) | |
| 60 | |
| 61 def retrieveSeq(fastaFile, gffFeatures, extraSeq, filename, outputDir): | |
| 62 #parsing the sequence file into a SeqIO dictionary. one contig per entry | |
| 63 handle = open(fastaFile, "rU") | |
| 64 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | |
| 65 handle.close() | |
| 66 | |
| 67 with open(outputDir+'/'+filename+'.fasta','w') as output_handle: | |
| 68 fails=0 | |
| 69 successes=0 | |
| 70 records=[] | |
| 71 for locus, location in gffFeatures.items(): | |
| 72 #print locus | |
| 73 contigSeq=records_dict[location[0]].seq | |
| 74 coord1=location[1]-extraSeq | |
| 75 coord2=location[2]+extraSeq | |
| 76 if coord1 < 0 or coord2 > len(contigSeq): | |
| 77 fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') | |
| 78 fail_log.write(locus+'\n') | |
| 79 fail_log.close() | |
| 80 fails+=1 | |
| 81 else: | |
| 82 geneseq=str(contigSeq[coord1:coord2]) | |
| 83 if location[3] == '-': | |
| 84 seq = Seq(geneseq) | |
| 85 geneseq =str(seq.reverse_complement()) | |
| 86 record = SeqRecord(Seq(geneseq), id=str(locus+'-'+str(location[0])+'#'+str(location[1])+'_'+str(location[2])), description='') | |
| 87 records.append(record) | |
| 88 successes+=1 | |
| 89 SeqIO.write(records, output_handle, "fasta") | |
| 90 print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) | |
| 91 if fails>0: | |
| 92 print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) | |
| 93 | |
| 94 def parseFeatures(tempGFF): | |
| 95 #parsing the feature file into a dictionary | |
| 96 gffFeatures={} | |
| 97 | |
| 98 with open(tempGFF, 'r') as temp_genes: | |
| 99 for line in temp_genes: | |
| 100 line=line.split('\t') | |
| 101 if "CDS" in line[2]: | |
| 102 ID=line[-1].split(';') | |
| 103 locusID=str(ID[0].split('=')[1]) | |
| 104 contig=line[0] | |
| 105 begining=int(line[3])-1 #to get the full sequence | |
| 106 end=int(line[4]) | |
| 107 strand=line[6] | |
| 108 location=[contig, begining, end, strand] | |
| 109 gffFeatures[locusID]=location | |
| 110 return gffFeatures | |
| 111 | |
| 112 def gffParser(gffFile, extraSeq=0, outputDir='.', keepTemporaryFiles=False, IDs=None, coordFile=None): | |
| 113 | |
| 114 filename=ntpath.basename(gffFile).replace('.gff', '') | |
| 115 | |
| 116 #cleaning temp files if they exist | |
| 117 if os.path.isfile(outputDir+'/'+filename+'_features.gff'): | |
| 118 os.remove(outputDir+'/'+filename+'_features.gff') | |
| 119 if os.path.isfile(outputDir+'/'+filename+'_sequence.fasta'): | |
| 120 os.remove(outputDir+'/'+filename+'_sequence.fasta') | |
| 121 | |
| 122 #cleaning fails file if it exists | |
| 123 if os.path.isfile(outputDir+'/'+filename+'_fails.txt'): | |
| 124 os.remove(outputDir+'/'+filename+'_fails.txt') | |
| 125 | |
| 126 if coordFile is None: | |
| 127 | |
| 128 if IDs is not None: | |
| 129 selectIDs=parseID(IDs) | |
| 130 else: | |
| 131 selectIDs=None | |
| 132 | |
| 133 #separating the gff into 2 different files: one with the features and another with the conting sequences | |
| 134 with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_features.gff', 'a') as temp_genes, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: | |
| 135 for line in in_handle: | |
| 136 if not line.startswith('##'): | |
| 137 if '\t' in line: | |
| 138 if selectIDs is not None: | |
| 139 items=line.split('\t') | |
| 140 ID=items[-1].split(';')[0] | |
| 141 ID=ID.split('=')[1] | |
| 142 if ID in selectIDs: | |
| 143 temp_genes.write(line) | |
| 144 else: | |
| 145 temp_genes.write(line) | |
| 146 else: | |
| 147 temp_contigs.write(line) | |
| 148 | |
| 149 gffFiles=parseFeatures(outputDir+'/'+filename+'_features.gff') | |
| 150 | |
| 151 retrieveSeq(outputDir+'/'+filename+'_sequence.fasta', gffFiles, extraSeq, filename, outputDir) | |
| 152 | |
| 153 else: | |
| 154 with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: | |
| 155 for line in in_handle: | |
| 156 if not line.startswith('##'): | |
| 157 if '\t' in line: | |
| 158 pass | |
| 159 else: | |
| 160 temp_contigs.write(line) | |
| 161 | |
| 162 retrieveSeq_File(outputDir+'/'+filename+'_sequence.fasta', coordFile, extraSeq, filename, outputDir) | |
| 163 | |
| 164 #removing temp files | |
| 165 if not keepTemporaryFiles: | |
| 166 try: | |
| 167 os.remove(outputDir+'/'+filename+'_features.gff') | |
| 168 except: | |
| 169 pass | |
| 170 os.remove(outputDir+'/'+filename+'_sequence.fasta') | |
| 171 | |
| 172 def main(): | |
| 173 | |
| 174 version='1.0.0' | |
| 175 | |
| 176 parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') | |
| 177 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) | |
| 178 | |
| 179 parser.add_argument('-i', '--input', help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from PROKKA).', type=argparse.FileType('r'), required=True) | |
| 180 parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, required=False) | |
| 181 parser.add_argument('-k','--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', action='store_true') | |
| 182 parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', required=False) | |
| 183 | |
| 184 | |
| 185 parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() | |
| 186 parser_optional_selected_regions_exclusive.add_argument('-s', '--select', help='txt file with the IDs of interest, one per line', type=argparse.FileType('r'), required=False) | |
| 187 parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', help='Sequence coordinates to be retrieved. Requires contig ID and coords (contig,strart,end) in a csv file. One per line.', type=argparse.FileType('r'), required=False) | |
| 188 | |
| 189 args = parser.parse_args() | |
| 190 | |
| 191 gffParser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), args.keepTemporaryFiles, os.path.abspath(args.select.name) if args.select is not None else None, os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) | |
| 192 | |
| 193 | |
| 194 if __name__ == "__main__": | |
| 195 main() |
