diff BlastParser_and_hits.py @ 0:9dfb65ebb02e draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blastparser_and_hits commit 48132e5edac97d54804ccbaf620068a5fb800bdc
| author | artbio |
| --- | --- |
| date | Sun, 15 Oct 2017 18:43:37 -0400 |
| parents | |
| children | 36103afa0934 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BlastParser_and_hits.py	Sun Oct 15 18:43:37 2017 -0400
@@ -0,0 +1,342 @@
+#!/usr/bin/python
+import argparse
+from collections import defaultdict
+
+
+def Parser():
+    the_parser = argparse.ArgumentParser()
+    the_parser.add_argument('--blast', action="store", type=str,
+                            help="Path to the blast output\
+                            (tabular format, 12 column)")
+    the_parser.add_argument('--sequences', action="store", type=str,
+                            help="Path to the fasta file with blasted\
+                            sequences")
+    the_parser.add_argument('--fastaOutput', action="store", type=str,
+                            help="fasta output file of blast hits")
+    the_parser.add_argument('--tabularOutput', action="store", type=str,
+                            help="tabular output file of blast analysis")
+    the_parser.add_argument('--flanking', action="store", type=int,
+                            help="number of flanking nucleotides\
+                            added to the hit sequences")
+    the_parser.add_argument('--mode', action="store",
+                            choices=["verbose", "short"], type=str,
+                            help="reporting (verbose) or not reporting (short)\
+                            oases contigs")
+    the_parser.add_argument('--filter_relativeCov', action="store", type=float,
+                            default=0,
+                            help="filter out relative coverages\
+                            below the specified ratio (float number)")
+    the_parser.add_argument('--filter_maxScore', action="store", type=float,
+                            default=0, help="filter out best BitScores below\
+                            the specified float number")
+    the_parser.add_argument('--filter_meanScore', action="store", type=float,
+                            default=0,
+                            help="filter out mean BitScores below the\
+                            specified float number")
+    the_parser.add_argument('--filter_term_in', action="store", type=str,
+                            default="",
+                            help="select the specified term in the\
+                            subject list")
+    the_parser.add_argument('--filter_term_out', action="store", type=str,
+                            default="",
+                            help="exclude the specified term from\
+                            the subject list")
+    the_parser.add_argument('--al_sequences', action="store", type=str,
+                            help="sequences that have been blast aligned")
+    the_parser.add_argument('--un_sequences', action="store", type=str,
+                            help="sequences that have not been blast aligned")
+    the_parser.add_argument('--dataset_name', action="store", type=str,
+                            default="",
+                            help="the name of the dataset that has been parsed,\
+                            to be reported in the output")
+    args = the_parser.parse_args()
+    if not all((args.sequences, args.blast, args.fastaOutput,
+                args.tabularOutput)):
+        the_parser.error('argument(s) missing, call the\
+                         -h option of the script')
+    if not args.flanking:
+        args.flanking = 0
+    return args
+
+
+def median(lst):
+    lst = sorted(lst)
+    if len(lst) < 1:
+        return None
+    if len(lst) % 2 == 1:
+        return lst[((len(lst)+1)/2)-1]
+    if len(lst) % 2 == 0:
+        return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0
+
+
+def mean(lst):
+    if len(lst) < 1:
+        return 0
+    return sum(lst) / float(len(lst))
+
+
+def getfasta(fastafile):
+    fastadic = {}
+    for line in open(fastafile):
+        if line[0] == ">":
+            header = line[1:-1]
+            fastadic[header] = ""
+        else:
+            fastadic[header] += line
+    for header in fastadic:
+        fastadic[header] = "".join(fastadic[header].split("\n"))
+    return fastadic
+
+
+def insert_newlines(string, every=60):
+    lines = []
+    for i in range(0, len(string), every):
+        lines.append(string[i:i+every])
+    return '\n'.join(lines)
+
+
+def getblast(blastfile):
+    '''blastinfo [0] Percentage of identical matches
+       blastinfo [1] Alignment length
+       blastinfo [2] Number of mismatches
+       blastinfo [3] Number of gap openings
+       blastinfo [4] Start of alignment in query
+       blastinfo [5] End of alignment in query
+       blastinfo [6] Start of alignment in subject (database hit)
+       blastinfo [7] End of alignment in subject (database hit)
+       blastinfo [8] Expectation value (E-value)
+       blastinfo [9] Bit score
+       blastinfo [10] Subject length
+       (NEED TO BE SPECIFIED WHEN RUNNING BLAST) '''
+    blastdic = defaultdict(dict)
+    for line in open(blastfile):
+        fields = line[:-1].split("\t")
+        transcript = fields[0]
+        subject = fields[1]
+        # blastinfo[0]
+        blastinfo = [float(fields[2])]
+        # blastinfo[1:8] insets 1 to 7
+        blastinfo = blastinfo + [int(i) for i in fields[3:10]]
+        # blastinfo[8] E-value remains as a string type
+        blastinfo.append(fields[10])
+        # blastinfo[9] Bit score
+        blastinfo.append(float(fields[11]))
+        # blastinfo[10] Subject length MUST BE RETRIEVED
+        # THROUGH A 13 COLUMN BLAST OUTPUT
+        blastinfo.append(int(fields[12]))
+        try:
+            blastdic[subject][transcript].append(blastinfo)
+        except Exception:
+            blastdic[subject][transcript] = [blastinfo]
+    return blastdic
+
+
+def getseq(fastadict, transcript, up, down, orientation="direct"):
+    def reverse(seq):
+        revdict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
+        revseq = [revdict[i] for i in seq[::-1]]
+        return "".join(revseq)
+    pickseq = fastadict[transcript][up-1:down]
+    if orientation == "direct":
+        return pickseq
+    else:
+        return reverse(pickseq)
+
+
+def subjectCoverage(fastadict, blastdict, subject,
+                    QueriesFlankingNucleotides=0):
+    SubjectCoverageList = []
+    HitDic = {}
+    bitScores = []
+    for transcript in blastdict[subject]:
+        prefix = "%s--%s_" % (subject, transcript)
+        hitNumber = 0
+        for hit in blastdict[subject][transcript]:
+            hitNumber += 1
+            suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber,
+                                                                  hit[0],
+                                                                  hit[1],
+                                                                  hit[8])
+            # query coverage by a hit is in hit[4:6]
+            HitDic[prefix+suffix] = GetHitSequence(fastadict, transcript,
+                                                   hit[4], hit[5],
+                                                   QueriesFlankingNucleotides)
+            # subject coverage by a hit is in hit[6:8]
+            SubjectCoverageList += range(min([hit[6], hit[7]]),
+                                         max([hit[6], hit[7]]) + 1)
+            bitScores.append(hit[9])
+            # always the same value for a given subject. Stupid but simple
+            subjectLength = hit[10]
+    TotalSubjectCoverage = len(set(SubjectCoverageList))
+    RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength)
+    return (HitDic, subjectLength, TotalSubjectCoverage,
+            RelativeSubjectCoverage, max(bitScores), mean(bitScores))
+
+
+def GetHitSequence(fastadict, FastaHeader, leftCoordinate, rightCoordinate,
+                   FlankingValue):
+    if rightCoordinate > leftCoordinate:
+        polarity = "direct"
+    else:
+        polarity = "reverse"
+        leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate
+    if leftCoordinate - FlankingValue > 0:
+        leftCoordinate -= FlankingValue
+    else:
+        leftCoordinate = 1
+    return getseq(fastadict, FastaHeader, leftCoordinate, rightCoordinate,
+                  polarity)
+
+
+def outputParsing(dataset_name, F, Fasta, results, Xblastdict, fastadict,
+                  filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0,
+                  filter_term_in="", filter_term_out="", mode="verbose"):
+    def filter_results(results, filter_relativeCov=0, filter_maxScore=0,
+                       filter_meanScore=0, filter_term_in="",
+                       filter_term_out=""):
+        for subject in results.keys():
+            if results[subject][
+                    "RelativeSubjectCoverage"] < filter_relativeCov:
+                del results[subject]
+                continue
+            if results[subject]["maxBitScores"] < filter_maxScore:
+                del results[subject]
+                continue
+            if results[subject]["meanBitScores"] < filter_meanScore:
+                del results[subject]
+                continue
+            if filter_term_in in subject:
+                pass
+            else:
+                del results[subject]
+                continue
+            if filter_term_out and filter_term_out in subject:
+                del results[subject]
+                continue
+        return results
+
+    F = open(F, "w")
+    Fasta = open(Fasta, "w")
+    blasted_transcripts = []
+    filter_results(results, filter_relativeCov, filter_maxScore,
+                   filter_meanScore, filter_term_in, filter_term_out)
+    for subject in results:
+        for transcript in Xblastdict[subject]:
+            blasted_transcripts.append(transcript)
+    blasted_transcripts = list(set(blasted_transcripts))
+    if mode == "verbose":
+        F.write("--- %s ---\n" % dataset_name)
+        F.write("# %s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ("SeqId", "%Identity",
+                                                        "AlignLength",
+                                                        "StartSubject",
+                                                        "EndSubject",
+                                                        "%QueryHitCov",
+                                                        "E-value",
+                                                        "BitScore"))
+        for subject in sorted(results,
+                              key=lambda x: results[x]["meanBitScores"],
+                              reverse=True):
+            F.write(" \n# %s\n" % subject)
+            F.write("# Suject Length: %s\n" %
+                    results[subject]["subjectLength"])
+            F.write("# Total Subject Coverage: %s\n" %
+                    results[subject]["TotalCoverage"])
+            F.write("# Relative Subject Coverage: %s\n" %
+                    results[subject]["RelativeSubjectCoverage"])
+            F.write("# Best Bit Score: %s\n" % results[subject][
+                "maxBitScores"])
+            F.write("# Mean Bit Score: %s\n" % results[subject][
+                "meanBitScores"])
+            for header in results[subject]["HitDic"]:
+                Fasta.write(">%s\n%s\n" % (header,
+                                           insert_newlines(results[subject][
+                                               "HitDic"][
+                                               header])))
+            Fasta.write("\n")  # final carriage return for the sequence
+            for transcript in Xblastdict[subject]:
+                transcriptSize = float(len(fastadict[transcript]))
+                for hit in Xblastdict[subject][transcript]:
+                    percentIdentity = hit[0]
+                    alignLenght = hit[1]
+                    subjectStart = hit[6]
+                    subjectEnd = hit[7]
+                    queryCov = "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
+                    Eval, BitScore = hit[8], hit[9]
+                    info = [transcript] + [percentIdentity, alignLenght,
+                                           subjectStart, subjectEnd, queryCov,
+                                           Eval, BitScore]
+                    info = [str(i) for i in info]
+                    info = "\t".join(info)
+                    F.write("%s\n" % info)
+    else:
+        F.write("--- %s ---\n" % dataset_name)
+        F.write("# subject\tsubject length\tTotal Subject Coverage\tRelative\
+                 Subject Coverage\tBest Bit Score\tMean Bit Score\n")
+        for subject in sorted(results,
+                              key=lambda x: results[x]["meanBitScores"],
+                              reverse=True):
+            line = []
+            line.append(subject)
+            line.append(results[subject]["subjectLength"])
+            line.append(results[subject]["TotalCoverage"])
+            line.append(results[subject]["RelativeSubjectCoverage"])
+            line.append(results[subject]["maxBitScores"])
+            line.append(results[subject]["meanBitScores"])
+            line = [str(i) for i in line]
+            F.write("%s\n" % "\t".join(line))
+            for header in results[subject]["HitDic"]:
+                Fasta.write(">%s\n%s\n" % (header,
+                                           insert_newlines(
+                                               results[subject][
+                                                   "HitDic"][header])))
+            Fasta.write("\n")  # final carriage return for the sequence
+    F.close()
+    Fasta.close()
+    return blasted_transcripts
+
+
+def dispatch_sequences(fastadict, blasted_transcripts, matched_sequences,
+                       unmatched_sequences):
+    '''to output the sequences that matched and did not matched in the blast'''
+    F_matched = open(matched_sequences, "w")
+    F_unmatched = open(unmatched_sequences, "w")
+    for transcript in fastadict:
+        if transcript in blasted_transcripts:
+            ''''list of blasted_transcripts is generated
+                by the outputParsing function'''
+            F_matched.write(">%s\n%s\n" % (transcript, insert_newlines(
+                fastadict[transcript])))
+        else:
+            F_unmatched.write(">%s\n%s\n" % (transcript, insert_newlines(
+                fastadict[transcript])))
+    F_matched.close()
+    F_unmatched.close()
+    return
+
+
+def __main__():
+    args = Parser()
+    fastadict = getfasta(args.sequences)
+    Xblastdict = getblast(args.blast)
+    results = defaultdict(dict)
+    for subject in Xblastdict:
+        results[subject]["HitDic"], results[subject]["subjectLength"], results[
+            subject]["TotalCoverage"], results[subject][
+            "RelativeSubjectCoverage"], results[subject][
+            "maxBitScores"], results[subject][
+            "meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject,
+                                               args.flanking)
+    blasted_transcripts = outputParsing(
+        args.dataset_name, args.tabularOutput,
+        args.fastaOutput, results, Xblastdict, fastadict,
+        filter_relativeCov=args.filter_relativeCov,
+        filter_maxScore=args.filter_maxScore,
+        filter_meanScore=args.filter_meanScore,
+        filter_term_in=args.filter_term_in,
+        filter_term_out=args.filter_term_out, mode=args.mode)
+    dispatch_sequences(fastadict, blasted_transcripts, args.al_sequences,
+                       args.un_sequences)
+
+
+if __name__ == "__main__":
+    __main__()
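For orientation, a minimal usage sketch follows; it is not part of the commit above. The file names (`query.fa`, `blast13.tab`) and the flanking value are placeholders, and the BLAST table must carry 13 tab-separated columns: the 12 standard tabular fields plus the subject length (with BLAST+, for example, via `-outfmt '6 std slen'`), since `getblast` reads `fields[12]`.

```python
# Hypothetical sketch, not part of the commit: exercising the parsing helpers
# directly. Assumes BlastParser_and_hits.py is importable from the working
# directory; "query.fa" and "blast13.tab" are placeholder file names.
from BlastParser_and_hits import getfasta, getblast, subjectCoverage

fastadict = getfasta("query.fa")     # {fasta header: sequence, newlines removed}
blastdict = getblast("blast13.tab")  # {subject: {query: [per-hit info lists]}}

for subject in blastdict:
    (hitdic, subject_length, total_coverage, relative_coverage,
     best_bitscore, mean_bitscore) = subjectCoverage(
        fastadict, blastdict, subject, QueriesFlankingNucleotides=10)
    # hitdic maps a descriptive header to the query sub-sequence covering the
    # hit (plus flanking nucleotides); the other values summarise subject
    # coverage and bit scores, as reported in the tabular output.
    print("%s\t%s\t%s\t%.3f\t%s\t%s" % (subject, subject_length,
                                        total_coverage, relative_coverage,
                                        best_bitscore, mean_bitscore))
```

The full pipeline (filtering, the verbose or short tabular report, the FASTA of hit sequences, and the matched/unmatched FASTA split) is what `__main__()` wires together from the command-line arguments defined in `Parser()`.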