Mercurial > repos > artbio > blastparser_and_hits
comparison BlastParser_and_hits.py @ 0:9dfb65ebb02e draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blastparser_and_hits commit 48132e5edac97d54804ccbaf620068a5fb800bdc
| author | artbio |
|---|---|
| date | Sun, 15 Oct 2017 18:43:37 -0400 |
| parents | |
| children | 36103afa0934 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9dfb65ebb02e |
|---|---|
| 1 #!/usr/bin/python | |
| 2 import argparse | |
| 3 from collections import defaultdict | |
| 4 | |
| 5 | |
| 6 def Parser(): | |
| 7 the_parser = argparse.ArgumentParser() | |
| 8 the_parser.add_argument('--blast', action="store", type=str, | |
| 9 help="Path to the blast output\ | |
| 10 (tabular format, 12 column)") | |
| 11 the_parser.add_argument('--sequences', action="store", type=str, | |
| 12 help="Path to the fasta file with blasted\ | |
| 13 sequences") | |
| 14 the_parser.add_argument('--fastaOutput', action="store", type=str, | |
| 15 help="fasta output file of blast hits") | |
| 16 the_parser.add_argument('--tabularOutput', action="store", type=str, | |
| 17 help="tabular output file of blast analysis") | |
| 18 the_parser.add_argument('--flanking', action="store", type=int, | |
| 19 help="number of flanking nucleotides\ | |
| 20 added to the hit sequences") | |
| 21 the_parser.add_argument('--mode', action="store", | |
| 22 choices=["verbose", "short"], type=str, | |
| 23 help="reporting (verbose) or not reporting (short)\ | |
| 24 oases contigs") | |
| 25 the_parser.add_argument('--filter_relativeCov', action="store", type=float, | |
| 26 default=0, | |
| 27 help="filter out relative coverages\ | |
| 28 below the specified ratio (float number)") | |
| 29 the_parser.add_argument('--filter_maxScore', action="store", type=float, | |
| 30 default=0, help="filter out best BitScores below\ | |
| 31 the specified float number") | |
| 32 the_parser.add_argument('--filter_meanScore', action="store", type=float, | |
| 33 default=0, | |
| 34 help="filter out mean BitScores below the\ | |
| 35 specified float number") | |
| 36 the_parser.add_argument('--filter_term_in', action="store", type=str, | |
| 37 default="", | |
| 38 help="select the specified term in the\ | |
| 39 subject list") | |
| 40 the_parser.add_argument('--filter_term_out', action="store", type=str, | |
| 41 default="", | |
| 42 help="exclude the specified term from\ | |
| 43 the subject list") | |
| 44 the_parser.add_argument('--al_sequences', action="store", type=str, | |
| 45 help="sequences that have been blast aligned") | |
| 46 the_parser.add_argument('--un_sequences', action="store", type=str, | |
| 47 help="sequences that have not been blast aligned") | |
| 48 the_parser.add_argument('--dataset_name', action="store", type=str, | |
| 49 default="", | |
| 50 help="the name of the dataset that has been parsed,\ | |
| 51 to be reported in the output") | |
| 52 args = the_parser.parse_args() | |
| 53 if not all((args.sequences, args.blast, args.fastaOutput, | |
| 54 args.tabularOutput)): | |
| 55 the_parser.error('argument(s) missing, call the\ | |
| 56 -h option of the script') | |
| 57 if not args.flanking: | |
| 58 args.flanking = 0 | |
| 59 return args | |
| 60 | |
| 61 | |
| 62 def median(lst): | |
| 63 lst = sorted(lst) | |
| 64 if len(lst) < 1: | |
| 65 return None | |
| 66 if len(lst) % 2 == 1: | |
| 67 return lst[((len(lst)+1)/2)-1] | |
| 68 if len(lst) % 2 == 0: | |
| 69 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 | |
| 70 | |
| 71 | |
| 72 def mean(lst): | |
| 73 if len(lst) < 1: | |
| 74 return 0 | |
| 75 return sum(lst) / float(len(lst)) | |
| 76 | |
| 77 | |
| 78 def getfasta(fastafile): | |
| 79 fastadic = {} | |
| 80 for line in open(fastafile): | |
| 81 if line[0] == ">": | |
| 82 header = line[1:-1] | |
| 83 fastadic[header] = "" | |
| 84 else: | |
| 85 fastadic[header] += line | |
| 86 for header in fastadic: | |
| 87 fastadic[header] = "".join(fastadic[header].split("\n")) | |
| 88 return fastadic | |
| 89 | |
| 90 | |
| 91 def insert_newlines(string, every=60): | |
| 92 lines = [] | |
| 93 for i in range(0, len(string), every): | |
| 94 lines.append(string[i:i+every]) | |
| 95 return '\n'.join(lines) | |
| 96 | |
| 97 | |
| 98 def getblast(blastfile): | |
| 99 '''blastinfo [0] Percentage of identical matches | |
| 100 blastinfo [1] Alignment length | |
| 101 blastinfo [2] Number of mismatches | |
| 102 blastinfo [3] Number of gap openings | |
| 103 blastinfo [4] Start of alignment in query | |
| 104 blastinfo [5] End of alignment in query | |
| 105 blastinfo [6] Start of alignment in subject (database hit) | |
| 106 blastinfo [7] End of alignment in subject (database hit) | |
| 107 blastinfo [8] Expectation value (E-value) | |
| 108 blastinfo [9] Bit score | |
| 109 blastinfo [10] Subject length | |
| 110 (NEED TO BE SPECIFIED WHEN RUNNING BLAST) ''' | |
| 111 blastdic = defaultdict(dict) | |
| 112 for line in open(blastfile): | |
| 113 fields = line[:-1].split("\t") | |
| 114 transcript = fields[0] | |
| 115 subject = fields[1] | |
| 116 # blastinfo[0] | |
| 117 blastinfo = [float(fields[2])] | |
| 118 # blastinfo[1:8] insets 1 to 7 | |
| 119 blastinfo = blastinfo + [int(i) for i in fields[3:10]] | |
| 120 # blastinfo[8] E-value remains as a string type | |
| 121 blastinfo.append(fields[10]) | |
| 122 # blastinfo[9] Bit score | |
| 123 blastinfo.append(float(fields[11])) | |
| 124 # blastinfo[10] Subject length MUST BE RETRIEVED | |
| 125 # THROUGH A 13 COLUMN BLAST OUTPUT | |
| 126 blastinfo.append(int(fields[12])) | |
| 127 try: | |
| 128 blastdic[subject][transcript].append(blastinfo) | |
| 129 except Exception: | |
| 130 blastdic[subject][transcript] = [blastinfo] | |
| 131 return blastdic | |
| 132 | |
| 133 | |
| 134 def getseq(fastadict, transcript, up, down, orientation="direct"): | |
| 135 def reverse(seq): | |
| 136 revdict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} | |
| 137 revseq = [revdict[i] for i in seq[::-1]] | |
| 138 return "".join(revseq) | |
| 139 pickseq = fastadict[transcript][up-1:down] | |
| 140 if orientation == "direct": | |
| 141 return pickseq | |
| 142 else: | |
| 143 return reverse(pickseq) | |
| 144 | |
| 145 | |
| 146 def subjectCoverage(fastadict, blastdict, subject, | |
| 147 QueriesFlankingNucleotides=0): | |
| 148 SubjectCoverageList = [] | |
| 149 HitDic = {} | |
| 150 bitScores = [] | |
| 151 for transcript in blastdict[subject]: | |
| 152 prefix = "%s--%s_" % (subject, transcript) | |
| 153 hitNumber = 0 | |
| 154 for hit in blastdict[subject][transcript]: | |
| 155 hitNumber += 1 | |
| 156 suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber, | |
| 157 hit[0], | |
| 158 hit[1], | |
| 159 hit[8]) | |
| 160 # query coverage by a hit is in hit[4:6] | |
| 161 HitDic[prefix+suffix] = GetHitSequence(fastadict, transcript, | |
| 162 hit[4], hit[5], | |
| 163 QueriesFlankingNucleotides) | |
| 164 # subject coverage by a hit is in hit[6:8] | |
| 165 SubjectCoverageList += range(min([hit[6], hit[7]]), | |
| 166 max([hit[6], hit[7]]) + 1) | |
| 167 bitScores.append(hit[9]) | |
| 168 # always the same value for a given subject. Stupid but simple | |
| 169 subjectLength = hit[10] | |
| 170 TotalSubjectCoverage = len(set(SubjectCoverageList)) | |
| 171 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) | |
| 172 return (HitDic, subjectLength, TotalSubjectCoverage, | |
| 173 RelativeSubjectCoverage, max(bitScores), mean(bitScores)) | |
| 174 | |
| 175 | |
| 176 def GetHitSequence(fastadict, FastaHeader, leftCoordinate, rightCoordinate, | |
| 177 FlankingValue): | |
| 178 if rightCoordinate > leftCoordinate: | |
| 179 polarity = "direct" | |
| 180 else: | |
| 181 polarity = "reverse" | |
| 182 leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate | |
| 183 if leftCoordinate - FlankingValue > 0: | |
| 184 leftCoordinate -= FlankingValue | |
| 185 else: | |
| 186 leftCoordinate = 1 | |
| 187 return getseq(fastadict, FastaHeader, leftCoordinate, rightCoordinate, | |
| 188 polarity) | |
| 189 | |
| 190 | |
| 191 def outputParsing(dataset_name, F, Fasta, results, Xblastdict, fastadict, | |
| 192 filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, | |
| 193 filter_term_in="", filter_term_out="", mode="verbose"): | |
| 194 def filter_results(results, filter_relativeCov=0, filter_maxScore=0, | |
| 195 filter_meanScore=0, filter_term_in="", | |
| 196 filter_term_out=""): | |
| 197 for subject in results.keys(): | |
| 198 if results[subject][ | |
| 199 "RelativeSubjectCoverage"] < filter_relativeCov: | |
| 200 del results[subject] | |
| 201 continue | |
| 202 if results[subject]["maxBitScores"] < filter_maxScore: | |
| 203 del results[subject] | |
| 204 continue | |
| 205 if results[subject]["meanBitScores"] < filter_meanScore: | |
| 206 del results[subject] | |
| 207 continue | |
| 208 if filter_term_in in subject: | |
| 209 pass | |
| 210 else: | |
| 211 del results[subject] | |
| 212 continue | |
| 213 if filter_term_out and filter_term_out in subject: | |
| 214 del results[subject] | |
| 215 continue | |
| 216 return results | |
| 217 | |
| 218 F = open(F, "w") | |
| 219 Fasta = open(Fasta, "w") | |
| 220 blasted_transcripts = [] | |
| 221 filter_results(results, filter_relativeCov, filter_maxScore, | |
| 222 filter_meanScore, filter_term_in, filter_term_out) | |
| 223 for subject in results: | |
| 224 for transcript in Xblastdict[subject]: | |
| 225 blasted_transcripts.append(transcript) | |
| 226 blasted_transcripts = list(set(blasted_transcripts)) | |
| 227 if mode == "verbose": | |
| 228 F.write("--- %s ---\n" % dataset_name) | |
| 229 F.write("# %s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ("SeqId", "%Identity", | |
| 230 "AlignLength", | |
| 231 "StartSubject", | |
| 232 "EndSubject", | |
| 233 "%QueryHitCov", | |
| 234 "E-value", | |
| 235 "BitScore")) | |
| 236 for subject in sorted(results, | |
| 237 key=lambda x: results[x]["meanBitScores"], | |
| 238 reverse=True): | |
| 239 F.write(" \n# %s\n" % subject) | |
| 240 F.write("# Suject Length: %s\n" % | |
| 241 results[subject]["subjectLength"]) | |
| 242 F.write("# Total Subject Coverage: %s\n" % | |
| 243 results[subject]["TotalCoverage"]) | |
| 244 F.write("# Relative Subject Coverage: %s\n" % | |
| 245 results[subject]["RelativeSubjectCoverage"]) | |
| 246 F.write("# Best Bit Score: %s\n" % results[subject][ | |
| 247 "maxBitScores"]) | |
| 248 F.write("# Mean Bit Score: %s\n" % results[subject][ | |
| 249 "meanBitScores"]) | |
| 250 for header in results[subject]["HitDic"]: | |
| 251 Fasta.write(">%s\n%s\n" % (header, | |
| 252 insert_newlines(results[subject][ | |
| 253 "HitDic"][ | |
| 254 header]))) | |
| 255 Fasta.write("\n") # final carriage return for the sequence | |
| 256 for transcript in Xblastdict[subject]: | |
| 257 transcriptSize = float(len(fastadict[transcript])) | |
| 258 for hit in Xblastdict[subject][transcript]: | |
| 259 percentIdentity = hit[0] | |
| 260 alignLenght = hit[1] | |
| 261 subjectStart = hit[6] | |
| 262 subjectEnd = hit[7] | |
| 263 queryCov = "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) | |
| 264 Eval, BitScore = hit[8], hit[9] | |
| 265 info = [transcript] + [percentIdentity, alignLenght, | |
| 266 subjectStart, subjectEnd, queryCov, | |
| 267 Eval, BitScore] | |
| 268 info = [str(i) for i in info] | |
| 269 info = "\t".join(info) | |
| 270 F.write("%s\n" % info) | |
| 271 else: | |
| 272 F.write("--- %s ---\n" % dataset_name) | |
| 273 F.write("# subject\tsubject length\tTotal Subject Coverage\tRelative\ | |
| 274 Subject Coverage\tBest Bit Score\tMean Bit Score\n") | |
| 275 for subject in sorted(results, | |
| 276 key=lambda x: results[x]["meanBitScores"], | |
| 277 reverse=True): | |
| 278 line = [] | |
| 279 line.append(subject) | |
| 280 line.append(results[subject]["subjectLength"]) | |
| 281 line.append(results[subject]["TotalCoverage"]) | |
| 282 line.append(results[subject]["RelativeSubjectCoverage"]) | |
| 283 line.append(results[subject]["maxBitScores"]) | |
| 284 line.append(results[subject]["meanBitScores"]) | |
| 285 line = [str(i) for i in line] | |
| 286 F.write("%s\n" % "\t".join(line)) | |
| 287 for header in results[subject]["HitDic"]: | |
| 288 Fasta.write(">%s\n%s\n" % (header, | |
| 289 insert_newlines( | |
| 290 results[subject][ | |
| 291 "HitDic"][header]))) | |
| 292 Fasta.write("\n") # final carriage return for the sequence | |
| 293 F.close() | |
| 294 Fasta.close() | |
| 295 return blasted_transcripts | |
| 296 | |
| 297 | |
| 298 def dispatch_sequences(fastadict, blasted_transcripts, matched_sequences, | |
| 299 unmatched_sequences): | |
| 300 '''to output the sequences that matched and did not matched in the blast''' | |
| 301 F_matched = open(matched_sequences, "w") | |
| 302 F_unmatched = open(unmatched_sequences, "w") | |
| 303 for transcript in fastadict: | |
| 304 if transcript in blasted_transcripts: | |
| 305 ''''list of blasted_transcripts is generated | |
| 306 by the outputParsing function''' | |
| 307 F_matched.write(">%s\n%s\n" % (transcript, insert_newlines( | |
| 308 fastadict[transcript]))) | |
| 309 else: | |
| 310 F_unmatched.write(">%s\n%s\n" % (transcript, insert_newlines( | |
| 311 fastadict[transcript]))) | |
| 312 F_matched.close() | |
| 313 F_unmatched.close() | |
| 314 return | |
| 315 | |
| 316 | |
| 317 def __main__(): | |
| 318 args = Parser() | |
| 319 fastadict = getfasta(args.sequences) | |
| 320 Xblastdict = getblast(args.blast) | |
| 321 results = defaultdict(dict) | |
| 322 for subject in Xblastdict: | |
| 323 results[subject]["HitDic"], results[subject]["subjectLength"], results[ | |
| 324 subject]["TotalCoverage"], results[subject][ | |
| 325 "RelativeSubjectCoverage"], results[subject][ | |
| 326 "maxBitScores"], results[subject][ | |
| 327 "meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, | |
| 328 args.flanking) | |
| 329 blasted_transcripts = outputParsing( | |
| 330 args.dataset_name, args.tabularOutput, | |
| 331 args.fastaOutput, results, Xblastdict, fastadict, | |
| 332 filter_relativeCov=args.filter_relativeCov, | |
| 333 filter_maxScore=args.filter_maxScore, | |
| 334 filter_meanScore=args.filter_meanScore, | |
| 335 filter_term_in=args.filter_term_in, | |
| 336 filter_term_out=args.filter_term_out, mode=args.mode) | |
| 337 dispatch_sequences(fastadict, blasted_transcripts, args.al_sequences, | |
| 338 args.un_sequences) | |
| 339 | |
| 340 | |
| 341 if __name__ == "__main__": | |
| 342 __main__() |
