Mercurial > repos > drosofff > blastx_to_scaffold
changeset 0:a2e034f1638e draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Sun, 21 Jun 2015 14:40:10 -0400 |
parents | |
children | 940c0c669e96 |
files | blastx_to_scaffold.py blastx_to_scaffold.xml test-data/blastx.tab test-data/contigs.fa test-data/scaffold.fa |
diffstat | 5 files changed, 251 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python
"""Generate a DNA scaffold from a blastx alignment of contigs.

Inputs are a fasta file of contigs and a tabular blastx report with 13
columns (column 13 holds the subject protein length).  Every residue of the
subject protein starts out as the codon placeholder "NNN"; each blastx hit
replaces the residues it covers with the aligned nucleotide segment of the
contig.  The result is written as a single fasta record named "Scaffold".

The code runs unchanged on Python 2.7 and Python 3.
"""
import sys
import argparse

# IUPAC nucleotide complements (both cases); other characters are kept as-is.
_COMPLEMENT = dict(zip("ACGTRYSWKMBDHVNacgtryswkmbdhvn",
                       "TGCAYRSWMKVHDBNtgcayrswmkvhdbn"))


def insert_newlines(string, every=60):
    """Return *string* wrapped into lines of at most *every* characters.

    An empty string yields an empty string.  *every* must be non-zero
    (it is used as the step of ``range``).
    """
    return '\n'.join(string[i:i + every]
                     for i in range(0, len(string), every))


def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide *sequence*."""
    return "".join(_COMPLEMENT.get(base, base) for base in reversed(sequence))


def Parser(argv=None):
    """Parse the command line and return the argparse namespace.

    argv -- optional list of arguments; ``None`` (the default) makes
            argparse read ``sys.argv[1:]`` as before.
    """
    the_parser = argparse.ArgumentParser(
        description="Generate DNA scaffold from blastx alignment of Contigs")
    # All three paths are opened unconditionally, so a missing one is
    # reported by argparse instead of failing later inside open().
    the_parser.add_argument(
        '--sequences', action="store", type=str, required=True,
        help="input sequence file in fasta format")
    the_parser.add_argument(
        '--blastx-tab', dest="blastx_tab", action="store", type=str,
        required=True, help="13-columns tabular blastx output")
    the_parser.add_argument(
        '--output', action="store", type=str, required=True,
        help="output file path, fasta format")
    return the_parser.parse_args(argv)


def read_fasta(lines):
    """Build a ``{header: sequence}`` dict from an iterable of fasta lines.

    Line endings ("\\n" or "\\r\\n") are stripped explicitly, so a final
    line without a newline keeps its last base.  Blank lines are ignored.
    Each record is stored under its full header line; the first
    whitespace-delimited word of the header is added as an extra key when
    it is not already taken, so ids of the form ">Contig1 description"
    can be looked up as "Contig1".

    Raises ValueError if sequence data appears before any header.
    """
    chunks = {}
    aliases = {}
    header = None
    for line in lines:
        line = line.rstrip("\r\n")
        if not line:
            continue
        if line.startswith(">"):
            header = line[1:]
            chunks[header] = []  # a repeated header restarts the record
            words = header.split(None, 1)
            if words:
                aliases.setdefault(words[0], header)
        elif header is None:
            raise ValueError(
                "sequence data found before the first fasta header")
        else:
            chunks[header].append(line)
    contigs = {name: "".join(parts) for name, parts in chunks.items()}
    for alias, name in aliases.items():
        # Full headers always win over first-word aliases.
        contigs.setdefault(alias, contigs[name])
    return contigs


def parse_blastx(lines):
    """Parse tabular blastx lines into a list of hit tuples.

    Each tuple is ``(query_id, query_start, query_stop, subject_start,
    subject_stop, subject_length)``, read from columns 1, 7, 8, 9, 10 and
    13.  ``subject_length`` is ``None`` for a line without a 13th column.
    Blank lines and lines starting with "#" are skipped.

    Raises ValueError for a line with fewer than 10 columns.
    """
    hits = []
    for number, line in enumerate(lines, 1):
        line = line.rstrip("\r\n")
        if not line.strip() or line.startswith("#"):
            continue
        fields = line.split("\t")
        if len(fields) < 10:
            raise ValueError(
                "blastx line %d has %d columns, at least 10 are needed"
                % (number, len(fields)))
        subject_length = int(fields[12]) if len(fields) > 12 else None
        hits.append((fields[0], int(fields[6]), int(fields[7]),
                     int(fields[8]), int(fields[9]), subject_length))
    return hits


def build_scaffold(contigs, hits, prot_length):
    """Return the unwrapped scaffold sequence as one string.

    contigs     -- dict of contig id -> nucleotide sequence
    hits        -- tuples as produced by parse_blastx (first five items used)
    prot_length -- number of residues of the subject protein

    Positions 1..prot_length each start as "NNN".  For every hit the
    positions subject_start..subject_stop-1 are removed and the aligned
    contig segment is stored at subject_stop, so concatenating the
    remaining entries in position order yields the scaffold.  Hits are
    applied in input order: a later hit that covers the stop position of
    an earlier one replaces it.

    Raises ValueError when a hit names a contig missing from *contigs*.
    """
    scaffold = {position: "NNN" for position in range(1, prot_length + 1)}
    for hit in hits:
        query_id, query_start, query_stop, subject_start, subject_stop = \
            hit[:5]
        if query_id not in contigs:
            raise ValueError(
                "contig %r of the blastx file is absent from the fasta file"
                % query_id)
        contig = contigs[query_id]
        if query_start <= query_stop:
            segment = contig[query_start - 1:query_stop]
        else:
            # Query coordinates run backwards (presumably a reverse-strand
            # frame hit): take the same span and flip it, instead of
            # slicing to an empty string.
            segment = reverse_complement(contig[query_stop - 1:query_start])
        for position in range(subject_start, subject_stop):
            # pop() rather than del: overlapping hits, or hits reaching
            # past prot_length, must not raise KeyError.
            scaffold.pop(position, None)
        scaffold[subject_stop] = segment
    return "".join(scaffold[position] for position in sorted(scaffold))


def __main__(argv=None):
    """Command-line entry point: read both inputs, write the scaffold.

    argv -- optional argument list forwarded to Parser().

    Raises ValueError when the blastx file holds no hit or its first hit
    has no 13th column (the subject length).
    """
    args = Parser(argv)
    with open(args.sequences, "r") as contigs_file:
        contigs = read_fasta(contigs_file)
    with open(args.blastx_tab, "r") as blastx_file:
        hits = parse_blastx(blastx_file)
    if not hits:
        raise ValueError("no blastx hit found in %s" % args.blastx_tab)
    # The subject length is taken from the first hit, as all hits are
    # expected to target the same subject protein.
    prot_length = hits[0][5]
    if prot_length is None:
        raise ValueError(
            "column 13 (subject length) is missing in %s" % args.blastx_tab)
    final_sequence = insert_newlines(
        build_scaffold(contigs, hits, prot_length))
    with open(args.output, "w") as out:
        out.write(">Scaffold\n")
        out.write(final_sequence + "\n")


if __name__ == "__main__":
    __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blastx_to_scaffold.xml Sun Jun 21 14:40:10 2015 -0400 @@ -0,0 +1,42 @@ +<tool id="blastx2scaffold" name="blastx_to_scaffold" version="0.9.0"> +<description>Generate DNA scaffold from blastx alignment of Contigs</description> +<requirements> +</requirements> +<command interpreter="python"> + blastx_to_scaffold.py --sequences $sequences + --blastx-tab $blastx_tab + --output $output +</command> +<inputs> +<param name="sequences" type="data" format="fasta" label="Select a fasta contigs file"/> +<param name="blastx_tab" type="data" format="tabular" label="Select a blastx output from your history" help="must have 13 columns with column 13 containing the subject lenght, other columns are standard"/> + +</inputs> +<outputs> + <data format="fasta" name="output"/> +</outputs> + + +<tests> + <test> + <param name="sequences" value="contigs.fa" ftype="fasta"/> + <param name="blastx_tab" value="blastx.tab" ftype="tabular"/> + <output name="output" file="scaffold.fa" ftype="fasta"/> + </test> +</tests> + + +<help> + + +**What it Does** +This tool start from DNA contigs that aligned to a subject protein sequence through blastx. +The contigs must be provided in fasta format. The blastx output must be tabular, the 12 standard column plus column 13 with the length of the blastx subject. +The final scaffold is a DNA sequence. +Sequences of the subject protein which were not aligned to the contigs are replaced by Ns in this scaffold. + +**Attribution** +This Galaxy tool was created by drosofff@gmail.com on 28/05/2015 +</help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastx.tab Sun Jun 21 14:40:10 2015 -0400 @@ -0,0 +1,2 @@ +Contig1 gi|81971654|sp|Q9IJX4.1|POLN_CRPVC_RecName:_Full_Replicase_polyprotein;_Contains:_RecName:_Full_Pro 43.46 451 247 3 6 1343 237 684 6e-128 397 1771 +Contig2 gi|81971654|sp|Q9IJX4.1|POLN_CRPVC_RecName:_Full_Replicase_polyprotein;_Contains:_RecName:_Full_Pro 52.99 536 236 4 6 1571 1217 1750 0.0 580 1771
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/contigs.fa Sun Jun 21 14:40:10 2015 -0400 @@ -0,0 +1,52 @@ +>Contig1 +TAGATAAGGTTTGCTCATTTCTTGAGGATACTTTACCAGGTATGGTCGAGCACGTTACGC +TCGTAGCACAAAATACATCCGCGTCAGCCAAGGTGTTATCTGACGAGTTGATCAAATCAA +TGCTTTGCATTGTTTTGATTTGCTTGTTGATTGAAACCAAGTTCTATAAGACCGCTTTCG +CGGTACTTATAGTGGTTGCTCTACGTGTTTTCGGGTACAGTGAGCAAATAATTGAGACAG +CTATGGACATGTATCGCGTAATTAGGGCTCCAAAGGCTCAAGGTAATATGGAAGATGTCG +TTTTCCATCCGTGGTTGAACACGTGTGGAAAGTTGATTTTCCTACTTATCGCTGTCCTGT +GTCTCAAGAAATTACCAGGAAAGAACGACGTAGACACTTTCATGCGCAGGCTCGACAGCT +TACCCAAAGCTGTTAAGGGTGCGACACAACTACATGAATGGGTGTCAAAATACTTCGATC +TCTCTTTGGATCACGTCAAGGCGATGATTGTTGGTAAATCTTGTGCCGAAATGAAGAAGG +CTGAATCATCAAGCGCCAAAGTTTTGGCTTGGGCCGCTAGAGTTCAAGATTTCGTCAGAT +TGGAACAACGAAGTAAAATCGATAGTGATATCGCTGTCGCCAACGAGGCTGAAGCCTTGT +ATCACACTGGATTGCAATTTGCAGGAGACACTCTGTTACCTCCAGAATTGCACAAGGTCG +TGAACAGTGCGCTAAGACCAGCCCGCGATATATATGAGTACGTCACCCGCTCCCCAATAA +AAGGAGGGAGTCGTAAGATGAGACCCTTGATGATTTGGCTAGCTGGCCAGTCAGGAATTG +GGAAAACCTCTATGGTGGATCCTCTATGTATCGATTTGCTTCGAGCAATGGGTTATGTGG +GACCTGAACATCTCCACTCGTTGGTGTATGGCCGCCAAGTTGAGACGGAGTACTGGGATG +GTTACAAAGCCCACAAGATAGTGATCTATGATGATGCTTTTCAGCTGAAAGATGATGCTG +TGAACAGGAATTTGGAGGTATTTGAGGTTATACGTTCTTGCAACACGTATCCTCAACACC +TTCATATGGCTTGTCTCTCGGATAAAAACACTTTTTCAGTAGCGGAAGTGTACATCTACA +CTACCAACGAAATGAATGTCAAACTTGAGTCGCTGACTCATGAACAAGCATTCTACAACC +GCATGAGTGAAAACGCGTTCACTGTGCGTCCAAAAGAGGCTTATCGTCTAGTCGAAGAAG +GATCAACCGGCAATAAGCAGTATCGTTTGGACAAAACGAAAACCAAAGGAGCTATCGATC +TCGATGTGTACGAATTCGTGCGC +>Contig2 +CTAGAATCACAGCTCAGATGAGTTTTGAGGCACCGAAGGACGCAATTGAAGGACCGTGTC +AAACCCCGGAAGGATTGTTCGCCCCTATTGGCAAAGCGCCGATAGGCGTAGGGATGTCCA +CCAAGACGGCTATACGACCTTCACGCTTGTATGGAAGAATAACTAAACCGACAACTGCAC +CATCATACTTGGGTAAAGACGCGCTTTATCGTGGATTGACCAAGTGTGGTGTTCGCACAG +TTAATATTCAACCAGAATACATAGACGCAGCGGCGAATGACGTCGCACGCTATGTGTTAA +ACCAGCATGTTGGTCACGTGGATAGGGAACGATACACACGTATATTGTCGTACGAGGAGG +CTGTTAAGGGCGTGCCGTACGATGATTTCATGAAGTCAGTGACTCGAGTCACTTCCCCTG 
+GTTACCCCTATTGCTTGGATACTGGAAACATGCCAGGGAAAAGCAAATGGATGGGGCTCG +AACAAGATTTCGATATGACAAGTCCAGCTGCTTTGGCTTTGAGGAAAGATGTTGAAAGTT +TGTTGGAAGATTGCAAAAATGGCTTAGTCCGTGATGTGGTGTTTGTCGACACTCTCAAGG +ATGAAAGGCGCGAGCTGATAAAGGTGGAAGCAAAGAAGACTCGAGTCTTTTCTGCTGGAC +CACAGCATTTTGTAATAGCTTTCCGGCAATACTTTCTTCCATTCTCTGCCTGGGTCATGC +ATAACAGAATCGAAAACGAAGTAGCCGTTGGAACAAACCCCTTCTCAATGGATTGGCACA +ACATTGCTGTGCGTATGCGTAGTAAAGGGAGACACATTATTGCTGGAGATTTTAGCAATT +TTGATGGATCCCTCAACGCCCAAGTTCTCTGGACAATATTTTGGAAGATATTTGTCCCGT +GGCTTAATGATATTGAACCACTTGGTACACCCAAGAATGAGGAGAATCTGCGGGTCTGCA +CGAGTCTATGGACGCACTTGGTGCACTCCGTGCACATTTGTGGAGATAACTTGTACATGT +GGACACATTCTCAACCATCGGGCAATCCCTTCACGGTGATAATCAATAGTTTGTATAACT +CAGTTATCATGCGTGTCGTGTGGCAATACATAATGGCGAAAGAAGAACCTAAGTTACGCA +CAATGAACCATTTCAATCAACATGTTGCTATGGTTTCATATGGTGATGACAATCTACTTA +ACATCTCGGAAGGGGTAATTGATATCTTCAACCAACTTACCATCTCGGAAGCCATGCGTT +GGATAGGACACGAATACACAGATGAAACGAAAACAGGCGAGGCTGCGCCCTATCGGACAT +TGGAAGAAGTCCGTTTCCTTAAAAGAGGGTTCAGAATGGATCACCTCTTGTGTCGGTGGG +TAGCTCCTTTGAAGAAGGATGTCATCTACGAAATGCTTAATTGGACGCGCAAAGGGATTA +ACCCAGATGATGTGACGATGATGATCATTGATACAGCATTTAGGGAGATCTCTTATCACG +GAAGGGAAGCTTTCGAGAAGCTGCGAGGGCAGATACTTGAGCAGCGGGATGTGTTGGTTG +AATATCCTCAA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/scaffold.fa Sun Jun 21 14:40:10 2015 -0400 @@ -0,0 +1,89 @@ +>Scaffold +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGGTTTGCTCA +TTTCTTGAGGATACTTTACCAGGTATGGTCGAGCACGTTACGCTCGTAGCACAAAATACA +TCCGCGTCAGCCAAGGTGTTATCTGACGAGTTGATCAAATCAATGCTTTGCATTGTTTTG +ATTTGCTTGTTGATTGAAACCAAGTTCTATAAGACCGCTTTCGCGGTACTTATAGTGGTT +GCTCTACGTGTTTTCGGGTACAGTGAGCAAATAATTGAGACAGCTATGGACATGTATCGC +GTAATTAGGGCTCCAAAGGCTCAAGGTAATATGGAAGATGTCGTTTTCCATCCGTGGTTG +AACACGTGTGGAAAGTTGATTTTCCTACTTATCGCTGTCCTGTGTCTCAAGAAATTACCA +GGAAAGAACGACGTAGACACTTTCATGCGCAGGCTCGACAGCTTACCCAAAGCTGTTAAG +GGTGCGACACAACTACATGAATGGGTGTCAAAATACTTCGATCTCTCTTTGGATCACGTC +AAGGCGATGATTGTTGGTAAATCTTGTGCCGAAATGAAGAAGGCTGAATCATCAAGCGCC +AAAGTTTTGGCTTGGGCCGCTAGAGTTCAAGATTTCGTCAGATTGGAACAACGAAGTAAA +ATCGATAGTGATATCGCTGTCGCCAACGAGGCTGAAGCCTTGTATCACACTGGATTGCAA +TTTGCAGGAGACACTCTGTTACCTCCAGAATTGCACAAGGTCGTGAACAGTGCGCTAAGA +CCAGCCCGCGATATATATGAGTACGTCACCCGCTCCCCAATAAAAGGAGGGAGTCGTAAG +ATGAGACCCTTGATGATTTGGCTAGCTGGCCAGTCAGGAATTGGGAAAACCTCTATGGTG +GATCCTCTATGTATCGATTTGCTTCGAGCAATGGGTTATGTGGGACCTGAACATCTCCAC +TCGTTGGTGTATGGCCGCCAAGTTGAGACGGAGTACTGGGATGGTTACAAAGCCCACAAG +ATAGTGATCTATGATGATGCTTTTCAGCTGAAAGATGATGCTGTGAACAGGAATTTGGAG +GTATTTGAGGTTATACGTTCTTGCAACACGTATCCTCAACACCTTCATATGGCTTGTCTC 
+TCGGATAAAAACACTTTTTCAGTAGCGGAAGTGTACATCTACACTACCAACGAAATGAAT +GTCAAACTTGAGTCGCTGACTCATGAACAAGCATTCTACAACCGCATGAGTGAAAACGCG +TTCACTGTGCGTCCAAAAGAGGCTTATCGTCTAGTCGAAGAAGGATCAACCGGCAATAAG +CAGTATCGTTTGGACAAAACGAAAACCAAAGGAGCTATCGATCTCGATGTGTACGAATTC +GTGCGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATCACAGCTCAGATGAGT +TTTGAGGCACCGAAGGACGCAATTGAAGGACCGTGTCAAACCCCGGAAGGATTGTTCGCC 
+CCTATTGGCAAAGCGCCGATAGGCGTAGGGATGTCCACCAAGACGGCTATACGACCTTCA +CGCTTGTATGGAAGAATAACTAAACCGACAACTGCACCATCATACTTGGGTAAAGACGCG +CTTTATCGTGGATTGACCAAGTGTGGTGTTCGCACAGTTAATATTCAACCAGAATACATA +GACGCAGCGGCGAATGACGTCGCACGCTATGTGTTAAACCAGCATGTTGGTCACGTGGAT +AGGGAACGATACACACGTATATTGTCGTACGAGGAGGCTGTTAAGGGCGTGCCGTACGAT +GATTTCATGAAGTCAGTGACTCGAGTCACTTCCCCTGGTTACCCCTATTGCTTGGATACT +GGAAACATGCCAGGGAAAAGCAAATGGATGGGGCTCGAACAAGATTTCGATATGACAAGT +CCAGCTGCTTTGGCTTTGAGGAAAGATGTTGAAAGTTTGTTGGAAGATTGCAAAAATGGC +TTAGTCCGTGATGTGGTGTTTGTCGACACTCTCAAGGATGAAAGGCGCGAGCTGATAAAG +GTGGAAGCAAAGAAGACTCGAGTCTTTTCTGCTGGACCACAGCATTTTGTAATAGCTTTC +CGGCAATACTTTCTTCCATTCTCTGCCTGGGTCATGCATAACAGAATCGAAAACGAAGTA +GCCGTTGGAACAAACCCCTTCTCAATGGATTGGCACAACATTGCTGTGCGTATGCGTAGT +AAAGGGAGACACATTATTGCTGGAGATTTTAGCAATTTTGATGGATCCCTCAACGCCCAA +GTTCTCTGGACAATATTTTGGAAGATATTTGTCCCGTGGCTTAATGATATTGAACCACTT +GGTACACCCAAGAATGAGGAGAATCTGCGGGTCTGCACGAGTCTATGGACGCACTTGGTG +CACTCCGTGCACATTTGTGGAGATAACTTGTACATGTGGACACATTCTCAACCATCGGGC +AATCCCTTCACGGTGATAATCAATAGTTTGTATAACTCAGTTATCATGCGTGTCGTGTGG +CAATACATAATGGCGAAAGAAGAACCTAAGTTACGCACAATGAACCATTTCAATCAACAT +GTTGCTATGGTTTCATATGGTGATGACAATCTACTTAACATCTCGGAAGGGGTAATTGAT +ATCTTCAACCAACTTACCATCTCGGAAGCCATGCGTTGGATAGGACACGAATACACAGAT +GAAACGAAAACAGGCGAGGCTGCGCCCTATCGGACATTGGAAGAAGTCCGTTTCCTTAAA +AGAGGGTTCAGAATGGATCACCTCTTGTGTCGGTGGGTAGCTCCTTTGAAGAAGGATGTC +ATCTACGAAATGCTTAATTGGACGCGCAAAGGGATTAACCCAGATGATGTGACGATGATG +ATCATTGATACAGCATTTAGGGAGATCTCTTATCACGGAAGGGAAGCTTTCGAGAAGCTG +CGAGGGCAGATACTTGAGCAGCGGGATGTGTTGGTTGAATATCCTCAANNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN