Mercurial > repos > drosofff > msp_blastparser_and_hits
diff BlastParser_and_hits.py @ 1:1964514aabde draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 1cc2b50091f512593c502176619998f5908fc8e8
author | drosofff |
---|---|
date | Mon, 14 Sep 2015 12:18:46 -0400 |
parents | 69ea2a13947f |
children | bb0d4cd765c5 |
line wrap: on
line diff
--- a/BlastParser_and_hits.py Sun Jun 21 14:31:29 2015 -0400 +++ b/BlastParser_and_hits.py Mon Sep 14 12:18:46 2015 -0400 @@ -14,6 +14,11 @@ the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs") + the_parser.add_argument('--filter_relativeCov', action="store", type=float, default=0, help="filter out relative coverages below the specified ratio (float number)") + the_parser.add_argument('--filter_maxScore', action="store", type=float, default=0, help="filter out maximum BitScore below the specified float number") + the_parser.add_argument('--filter_meanScore', action="store", type=float, default=0, help="filter out maximum BitScore below the specified float number") + the_parser.add_argument('--al_sequences', action="store", type=str, help="sequences that have been blast aligned") + the_parser.add_argument('--un_sequences', action="store", type=str, help="sequences that have not been blast aligned") args = the_parser.parse_args() if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): the_parser.error('argument(s) missing, call the -h option of the script') @@ -122,12 +127,14 @@ leftCoordinate = 1 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) -def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"): +def outputParsing (F, Fasta, results, Xblastdict, fastadict, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, mode="verbose"): F= open(F, "w") Fasta=open(Fasta, "w") if mode == "verbose": print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + if results[subject]["RelativeSubjectCoverage"]<filter_relativeCov or results[subject]["maxBitScores"]<filter_maxScore or results[subject]["meanBitScores"]<filter_meanScore: + continue print >> F, "#\n# %s" % subject print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) @@ -149,6 +156,8 @@ else: print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score" for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + if results[subject]["RelativeSubjectCoverage"]<filter_relativeCov or results[subject]["maxBitScores"]<filter_maxScore or results[subject]["meanBitScores"]<filter_meanScore: + continue line = [] line.append(subject) line.append(results[subject]["subjectLength"]) @@ -164,14 +173,33 @@ F.close() Fasta.close() - +def sort_sequences (fastadict, blastdict, matched_sequences, unmatched_sequences): + '''to output the sequences that matched and did not matched in the blast''' + blasted_transcripts = [] + for subject in blastdict: + for transcript in blastdict[subject]: + blasted_transcripts.append(transcript) + blasted_transcripts = list( set( blasted_transcripts)) + F_matched = open (matched_sequences, "w") + F_unmatched = open (unmatched_sequences, "w") + for transcript in fastadict: + if transcript in blasted_transcripts: + print >> F_matched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) ) + else: + print >> F_unmatched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) ) + F_matched.close() + F_unmatched.close() + return def __main__ (): args = Parser() fastadict = getfasta (args.sequences) Xblastdict = getblast (args.blast) + sort_sequences (fastadict, Xblastdict, args.al_sequences, args.un_sequences) results = defaultdict(dict) for subject in Xblastdict: results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) - outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode) + outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, + filter_relativeCov=args.filter_relativeCov, filter_maxScore=args.filter_maxScore, + filter_meanScore=args.filter_meanScore, mode=args.mode) if __name__=="__main__": __main__()