Mercurial > repos > xuebing > sharplab_seq_motif
changeset 14:d1f0f85ee5bc
Deleted selected files
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 08:52:46 -0400 |
parents | 5a93fe53194d |
children | 0e221dbd17b2 |
files | mytools/dreme.xml mytools/fasta-dinucleotide-shuffle.py mytools/fastamarkov.xml mytools/fastashuffle1.xml mytools/fastashuffle2.xml mytools/fastqdump.xml mytools/fimo2.xml mytools/fimo2bed.py mytools/fimo2bed.xml mytools/iupac2meme.xml mytools/match.xml mytools/meme.xml mytools/memelogo.xml mytools/revcompl.py mytools/revcompl.xml mytools/seq2meme.py mytools/seq2meme.xml mytools/seqshuffle.py mytools/sequence.py mytools/splicesite.xml |
diffstat | 20 files changed, 0 insertions(+), 2033 deletions(-) [+] |
line wrap: on
line diff
--- a/mytools/dreme.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -<tool id="dreme" name="DREME"> - <description>short motif discovery</description> - <command interpreter="python">/Users/xuebing/bin/dreme.py -p $input -png -e $ethresh - #if $background_select.bg_select == "fromfile": - -n "${bgfile}" - #end if - - && mv dreme_out/dreme.html ${html_outfile} - - && mv dreme_out/dreme.txt ${txt_outfile} - - && mv dreme_out/dreme.xml ${xml_outfile} - - && rm -rf dreme_out - - </command> - <inputs> - <param name="input" type="data" format="fasta" label="Sequence file (FASTA)"/> - <conditional name="background_select"> - <param name="bg_select" type="select" label="Background sequence" > - <option value="shuffle" selected="true">shuffle the orignal sequence</option> - <option value="fromfile">load from file</option> - </param> - <when value="fromfile"> - <param name="bgfile" type="data" format="fasta" label="Background sequence file (FASTA)"/> - </when> - </conditional> - - <param name="ethresh" size="10" type="float" value="0.05" label="E-value threshold"/> - </inputs> - <outputs> - - <data format="xml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)"/> - <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (motif)"/> - <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/> - </outputs> - <help> - -**What it does** - -http://meme.sdsc.edu/meme/doc/dreme.html - -DREME (Discriminative Regular Expression Motif Elicitation) finds relatively short motifs (up to 8 bases) fast, and can perform discriminative motif discovery if given a negative set, consisting of sequences unlikely to contain a motif of interest that is however likely to be found in the main ("positive") sequence set. If you do not provide a negative set the program shuffles the positive set to provide a background (in the role of the negative set). - -The input to DREME is one or two sets of DNA sequences. The program uses a Fisher Exact Test to determine significance of each motif found in the postive set as compared with its representation in the negative set, using a significance threshold that may be set on the command line. - -DREME achieves its high speed by restricting its search to regular expressions based on the IUPAC alphabet representing bases and ambiguous characters, and by using a heuristic estimate of generalised motifs' statistical significance. - - </help> -</tool>
--- a/mytools/fasta-dinucleotide-shuffle.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ -#!/usr/bin/python - -import sys, string, random -import sequence - -# -# turn on psyco to speed up by 3X -# -if __name__=='__main__': - try: - import psyco - #psyco.log() - psyco.full() - psyco_found = True - except ImportError: -# psyco_found = False - pass -# print >> sys.stderr, "psyco_found", psyco_found - - -# altschulEriksonDinuclShuffle.py -# P. Clote, Oct 2003 - -def computeCountAndLists(s): - - #Initialize lists and mono- and dinucleotide dictionaries - List = {} #List is a dictionary of lists - List['A'] = []; List['C'] = []; - List['G'] = []; List['T'] = []; - # FIXME: is this ok? - List['N'] = [] - nuclList = ["A","C","G","T","N"] - s = s.upper() - #s = s.replace("U","T") - nuclCnt = {} #empty dictionary - dinuclCnt = {} #empty dictionary - for x in nuclList: - nuclCnt[x]=0 - dinuclCnt[x]={} - for y in nuclList: - dinuclCnt[x][y]=0 - - #Compute count and lists - nuclCnt[s[0]] = 1 - nuclTotal = 1 - dinuclTotal = 0 - for i in range(len(s)-1): - x = s[i]; y = s[i+1] - List[x].append( y ) - nuclCnt[y] += 1; nuclTotal += 1 - dinuclCnt[x][y] += 1; dinuclTotal += 1 - assert (nuclTotal==len(s)) - assert (dinuclTotal==len(s)-1) - return nuclCnt,dinuclCnt,List - - -def chooseEdge(x,dinuclCnt): - z = random.random() - denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N'] - numerator = dinuclCnt[x]['A'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['A'] -= 1 - return 'A' - numerator += dinuclCnt[x]['C'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['C'] -= 1 - return 'C' - numerator += dinuclCnt[x]['G'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['G'] -= 1 - return 'G' - numerator += dinuclCnt[x]['T'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['T'] -= 1 - return 'T' - dinuclCnt[x]['N'] -= 1 - return 'N' - -def connectedToLast(edgeList,nuclList,lastCh): - D = {} - for x in nuclList: D[x]=0 - for edge in edgeList: - a = edge[0]; b = edge[1] - if b==lastCh: D[a]=1 - for i in range(3): - for edge in edgeList: - a = edge[0]; b = edge[1] - if D[b]==1: D[a]=1 - ok = 0 - for x in nuclList: - if x!=lastCh and D[x]==0: return 0 - return 1 - -def eulerian(s): - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - #compute nucleotides appearing in s - nuclList = [] - for x in ["A","C","G","T","N"]: - if x in s: nuclList.append(x) - #create dinucleotide shuffle L - firstCh = s[0] #start with first letter of s - lastCh = s[-1] - edgeList = [] - for x in nuclList: - if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) - ok = connectedToLast(edgeList,nuclList,lastCh) - return ok,edgeList,nuclList,lastCh - - -def shuffleEdgeList(L): - n = len(L); barrier = n - for i in range(n-1): - z = int(random.random() * barrier) - tmp = L[z] - L[z]= L[barrier-1] - L[barrier-1] = tmp - barrier -= 1 - return L - -def dinuclShuffle(s): - ok = 0 - while not ok: - ok,edgeList,nuclList,lastCh = eulerian(s) - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - - #remove last edges from each vertex list, shuffle, then add back - #the removed edges at end of vertex lists. - for [x,y] in edgeList: List[x].remove(y) - for x in nuclList: shuffleEdgeList(List[x]) - for [x,y] in edgeList: List[x].append(y) - - #construct the eulerian path - L = [s[0]]; prevCh = s[0] - for i in range(len(s)-2): - ch = List[prevCh][0] - L.append( ch ) - del List[prevCh][0] - prevCh = ch - L.append(s[-1]) - t = string.join(L,"") - return t - -def main(): - - # - # defaults - # - file_name = None - seed = 1 - copies = 1 - - # - # get command line arguments - # - usage = """USAGE: - %s [options] - - -f <filename> file name (required) - -t <tag> added to shuffled sequence names - -s <seed> random seed; default: %d - -c <n> make <n> shuffled copies of each sequence; default: %d - -h print this usage message - """ % (sys.argv[0], seed, copies) - - # no arguments: print usage - if len(sys.argv) == 1: - print >> sys.stderr, usage; sys.exit(1) - - tag = ""; - - # parse command line - i = 1 - while i < len(sys.argv): - arg = sys.argv[i] - if (arg == "-f"): - i += 1 - try: file_name = sys.argv[i] - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-t"): - i += 1 - try: tag = sys.argv[i] - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-s"): - i += 1 - try: seed = string.atoi(sys.argv[i]) - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-c"): - i += 1 - try: copies = string.atoi(sys.argv[i]) - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-h"): - print >> sys.stderr, usage; sys.exit(1) - else: - print >> sys.stderr, "Unknown command line argument: " + arg - sys.exit(1) - i += 1 - - # check that required arguments given - if (file_name == None): - print >> sys.stderr, usage; sys.exit(1) - - random.seed(seed) - - # read sequences - seqs = sequence.readFASTA(file_name,'Extended DNA') - - for s in seqs: - str = s.getString() - #FIXME altschul can't handle ambigs - name = s.getName() - - #print >> sys.stderr, ">%s" % name - - for i in range(copies): - - shuffledSeq = dinuclShuffle(str) - - if (copies == 1): - print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq) - else: - print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq) - -if __name__ == '__main__': main()
--- a/mytools/fastamarkov.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -<tool id="fastamarkov" name="background model"> - <description>of DNA sequence</description> - <command>cat $input | fasta-get-markov -m $m $norc > $output 2> err.txt - - </command> - <inputs> - <param name="input" type="data" format="fasta" label="Sequence file (FASTA)"/> - <param name="m" size="10" type="integer" value="0" label="Order of Markov model to use"/> - <param name="norc" label="Combine forward and reverse complement frequencies" type="boolean" truevalue="" falsevalue="-norc" checked="True"/> - </inputs> - <outputs> - <data format="txt" name="output" /> - </outputs> - <help> - -**What it does** - -This tool generates a markov model from a fasta sequence file. - - </help> -</tool>
--- a/mytools/fastashuffle1.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -<tool id="seqshuffle" name="shuffle sequences"> - <description>preserving mono-nucleotide frequency</description> - <command>cat $input | fasta-shuffle-letters > $output </command> - <inputs> - <param name="input" type="data" format="fasta" label="Original FASTA sequence file"/> - </inputs> - <outputs> - <data format="fasta" name="output" /> - </outputs> - <help> - -**Description** - -shuffle the position of nucleotides in each sequence, preserving the frequency of nucleotides in that sequence, but do not preserve di-nucleotide frequency. - - </help> -</tool>
--- a/mytools/fastashuffle2.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -<tool id="seqshuffle2" name="shuffle sequence"> - <description>preserving dinucleotide frequency</description> - <command interpreter="python">fasta-dinucleotide-shuffle.py -f $input -t $tag -c $n -s $seed > $output </command> - <inputs> - <param name="input" format="fasta" type="data" label="Original sequence file"/> - <param name="tag" type="text" size="40" value="-shuffled" label="tag added to shuffled sequence name"/> - <param name="n" type="integer" value="1" label="number of shuffled copies for each sequence"/> - <param name="seed" type="integer" value="1" label="random seed" help="the same seed gives the same random sequences"/> - </inputs> - <outputs> - <data format="fasta" name="output" /> - </outputs> - <help> - -**What it does** - -This tool shuffles the sequences in the input file but preserves the dinucleotide frequency of each sequence. - -The code implements the Altschul-Erikson dinucleotide shuffle algorithm, described in "Significance of nucleotide sequence alignments: A method for random sequence permutation that preserves dinucleotide and codon usage", S.F. Altschul and B.W. Erikson, Mol. Biol. Evol., 2(6):526--538, 1985. - -Code adapted from http://bioinformatics.bc.edu/clotelab/RNAdinucleotideShuffle/dinucleotideShuffle.html - - </help> -</tool>
--- a/mytools/fastqdump.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -<tool id="fastqdump" name="fastq-dump"> - <description>convert SRA to FASTQ</description> - <command>/Users/xuebing/tools/sratoolkit.2.1.9-mac32/fastq-dump -A $input -M $minReadLen -Z > $out_file1 </command> - <inputs> - <param name="input" format="sra" type="data" label="Original file (SRA)"/> - <param name="minReadLen" size="10" type="integer" value="10" label="minimum read length to output"/> - </inputs> - <outputs> - <data format="fastq" name="out_file1" /> - </outputs> - <help> - -**What it does** - -This is a wrapper of the fastq-dump tool from sra-toolkit. See http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software - - </help> -</tool>
--- a/mytools/fimo2.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -<tool id="fimo" name="motif search"> - <description>using FIMO</description> - <command> fimo - #if $background_select.bg_select == "fromfile": - -bgfile $bgfile - #end if - - $norc --max-stored-scores 5000000 --output-pthresh $pth --verbosity 1 $motif $database - && mv fimo_out/fimo.html ${html_outfile} - - && mv fimo_out/fimo.txt ${txt_outfile} - - && rm -rf fimo_out - - </command> - <inputs> - - <param name="motif" type="data" format="txt" label="Motif file" help="created using the tool create-motif-file, or import from Shared Data"/> - <param name="database" type="data" format="fasta" label="Sequence file (FASTA)"/> - - <conditional name="background_select"> - <param name="bg_select" type="select" label="Background model" > - <option value="uniform" selected="true">uniform</option> - <option value="fromfile">load from file</option> - </param> - <when value="fromfile"> - <param name="bgfile" type="data" format="txt" label="File for background model"/> - </when> - </conditional> - - <param name="pth" size="10" type="float" value="0.0001" label="p-value threshold"/> - <param name="norc" label="Do not score the reverse complement DNA strand. Both strands are scored by default" type="boolean" truevalue="-norc" falsevalue="" checked="False"/> - </inputs> - <outputs> - <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/> - <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (txt)"/> - </outputs> - <help> - -**What it does** - -This tool uses FIMO to find matches of a motif in a fasta file. See more details: - -http://meme.sdsc.edu/meme/fimo-intro.html - - </help> -</tool>
--- a/mytools/fimo2bed.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -''' -#pattern name sequence name start stop score p-value q-value matched sequence -constitutive-donor mm9_chr1_39533592_39535592_- 1815 1823 12.032 4.26e-06 0.397 CAGGTAAGT -constitutive-donor mm9_chr1_59313750_59315750_+ 1889 1897 12.032 4.26e-06 0.397 CAGGTAAGT - -#pattern name sequence name start stop score p-value q-value matched sequence -constitutive-donor mm9_chr1_172019075_172021075_- 1947 1955 12.032 4.26e-06 0.843 CAGGTAAGT -constitutive-donor mm9_chr1_15300532_15302532_+ 156 164 12.032 4.26e-06 0.843 CAGGTAAGT -''' - -import sys - -def fimo2bed(filename,rc): - ''' - parse fimo output to make a bed file - rc: the sequence have been reverse complemented - ''' - f = open(filename) - header = f.readline() - for line in f: - pattern,posi,begin,stop,score,pv,qv,seq = line.strip().split('\t') - flds = posi.split('_') - start = flds[-3] - end = flds[-2] - strand = flds[-1] - chrom = '_'.join(flds[1:-3]) #'chrX_random' - if not rc: - if strand == '+': - start1 = str(int(start) + int(begin)-1) - end1 = str(int(start) + int(stop)) - print '\t'.join([chrom,start1,end1,seq,score,strand]) - else: - start1 = str(int(end) - int(stop)) - end1 = str(int(end) - int(begin)+1) - print '\t'.join([chrom,start1,end1,seq,score,strand]) - else: - if strand == '-': - start1 = str(int(start) + int(begin)-1) - end1 = str(int(start) + int(stop)) - print '\t'.join([chrom,start1,end1,seq,score,'+']) - else: - start1 = str(int(end) - int(stop)) - end1 = str(int(end) - int(begin)+1) - print '\t'.join([chrom,start1,end1,seq,score,'-']) - -fimo2bed(sys.argv[1],sys.argv[2]=='rc')
--- a/mytools/fimo2bed.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -<tool id="fimo2bed" name="fimo-to-bed"> - <description>convert FIMO output to BED</description> - <command interpreter="python">fimo2bed.py $input $rc > $output</command> - <inputs> - <param name="input" format="txt" type="data" label="FIMO output file"/> - <param name="rc" label="Check if the sequences are reverse complement" type="boolean" truevalue="rc" falsevalue="none" checked="False"/> - </inputs> - <outputs> - <data format="bed" name="output" /> - </outputs> - <help> - - Only works if your original FIMO input fasta sequences have ids like:: - - mm9_chr15_99358448_99360448_- - - - </help> -</tool>
--- a/mytools/iupac2meme.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -<tool id="iupac2meme" name="create-motif-file"> - <description>from one sequence</description> - <command>iupac2meme - #if $background_select.bg_select == "fromfile": - -bg $bgfile - #end if - -numseqs $numseqs $logodds $motif > $output </command> - <inputs> - <param name="motif" size="20" type="text" value="AATAAA" label="motif sequence" help="IUPAC motif, such as ACGGWNYCGT"/> - <conditional name="background_select"> - <param name="bg_select" type="select" label="Background model" > - <option value="uniform" selected="true">uniform</option> - <option value="fromfile">load from file</option> - </param> - <when value="fromfile"> - <param name="bgfile" type="data" format="txt" label="File for background model"/> - </when> - </conditional> - - <param name="numseqs" size="10" type="integer" value="20" label="Number of sequences (-numseqs)" help="assume frequencies based on this many sequences; default: 20"/> - <param name="logodds" label="also output log-odds (PSSM)" help="output the log-odds (PSSM) and frequency (PSPM) motifs; default: PSPM motif only" type="boolean" truevalue="-logodds" falsevalue="" checked="False"/> - </inputs> - <outputs> - <data format="txt" name="output" label="$motif-meme"/> - </outputs> - <help> - -**Description** - -Convert an IUPAC motif into a MEME version 4 formatted file suitable for use with FIMO and other MEME Suite programs. - -See additional information: - -http://meme.sdsc.edu/meme/doc/iupac2meme.html - -**IUPAC code**:: - - Nucleotide Code: Base: - ---------------- ----- - A.................Adenine - C.................Cytosine - G.................Guanine - T (or U)..........Thymine (or Uracil) - R.................A or G - Y.................C or T - S.................G or C - W.................A or T - K.................G or T - M.................A or C - B.................C or G or T - D.................A or G or T - H.................A or C or T - V.................A or C or G - N.................any base - - - - </help> -</tool>
--- a/mytools/match.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -<tool id="match" name="match"> - <description>find short motif occurrences</description> - <command> match $motif $seq $output $nmismatch $rc $bed > $log</command> - <inputs> - - <param name="motif" type="data" format="fasta" label="Search occurances of the following motifs" help="one or multiple sequences, FASTA format"/> - <param name="seq" type="data" format="fasta" label="in sequences" help="one or multiple sequences, FASTA format"/> - <param name="nmismatch" size="10" type="integer" value="0" label="number of mismatches allowed"/> - <param name="rc" label="Do not score the reverse complement DNA strand. Both strands are scored by default" type="boolean" truevalue="norc" falsevalue="rc" checked="False"/> - <param name="bed" label="output BED format" help="Default is tabular file (see example below). Please only check this if your sequence identifier looks like: hg18_chr6_122208322_122209078_+ (true for galaxy tool extracted sequences)" type="boolean" truevalue="bed" falsevalue="nobed" checked="False"/> - </inputs> - <outputs> - <data format="txt" name="log" label="${tool.name} on ${on_string} (log)"/> - <data format="bed" name="output" label="${tool.name} on ${on_string} (bed)"/> - </outputs> - <help> - -**What it does** - -This tool searches occurrences of a short nucleotide seuqences (allowing mismatches) in a set of longer sequences. - -Example motif file:: - - >motif1 - CAGGTAAGT - >motif2 - GTTTGGGGGCC - -Example sequence file:: - - >hg18_chr6_122208322_122209078_+ - CGTCGTAGCTACTAGCTACGTACGTACGTAGCTAGCATGCATGCTACGTA - CGTAGCTAGCTAAAAAAAAAAAAAAACTGCGGCTAGCTAGCTAGCTACGT - CGATCGTAGCTAC... - >hg18_chr6_1208322_122209023_+ - CGATGCTAGCTAGCTAGCTACGTAGCTAGCTAGTCGATGCTAGCTAGCTA - ATGCTAGCTAGC.... - -Output (bed):: - - chr11 72790893 72790902 ACTTAACTG 1 - antisense 5ss,G4T:CAGTTAAGT-rc hg18_chr11_72790846_72791902_+ 47 - chr11 72791880 72791889 CAGGTAAGA 1 + sense 5ss,T9A:CAGGTAAGA hg18_chr11_72790846_72791902_+ 1034 - - -Output (tab):: - - Tmod4 802 5ss:CAGGTAAGT-rc ACTTACCTG - Atp7b 77 5ss:CAGGTAAGT CAGGTAAGT - Fnta 665 5ss:CAGGTAAGT CAGGTAAGT - - - - </help> -</tool>
--- a/mytools/meme.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,349 +0,0 @@ -<tool id="meme_meme" name="MEME" version="1.0.0"> - <requirements><requirement type='package'>meme</requirement></requirements> - <description>motif discovery</description> - <command>meme "$input1" -o "${html_outfile.files_path}" - -nostatus - - ##-p 8 ##number of processors - - #if str( $options_type.options_type_selector ) == 'advanced': - -sf "${ str( $options_type.sf ).replace( ' ', '_' ) }" - -${options_type.alphabet_type.alphabet_type_selector} - -mod "${options_type.mod_type.mod_type_selector}" - -nmotifs "${options_type.nmotifs}" - -wnsites "${options_type.wnsites}" - -maxsize "${options_type.maxsize}" - - #if $options_type.evt < float('inf'): - -evt "${options_type.evt}" - #end if - - #if str( $options_type.mod_type.mod_type_selector ) != 'oops': - #if str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'nsites': - -nsites "${options_type.mod_type.motif_occurrence_type.nsites}" - #elif str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'min_max_sites': - -minsites "${options_type.mod_type.motif_occurrence_type.minsites}" -maxsites "${options_type.mod_type.motif_occurrence_type.maxsites}" - #end if - #end if - - #if str( $options_type.motif_width_type.motif_width_type_selector ) == 'exact': - -w "${options_type.motif_width_type.width}" - #else - -minw "${options_type.motif_width_type.minw}" -maxw "${options_type.motif_width_type.maxw}" - #end if - - #if str( $options_type.motif_trim_type.motif_trim_type_selector ) == 'nomatrim': - -nomatrim - #else - -wg "${options_type.motif_trim_type.wg}" -ws "${options_type.motif_trim_type.ws}" ${options_type.motif_trim_type.noendgaps} - #end if - - #if str( $options_type.bfile ) != 'None': - -bfile "${options_type.bfile}" - #end if - - #if str( $options_type.pspfile ) != 'None': - -psp "${options_type.pspfile}" - #end if - - #if str( $options_type.alphabet_type.alphabet_type_selector ) == "dna": - ${options_type.alphabet_type.revcomp} ${options_type.alphabet_type.pal} - #end if - - -maxiter "${options_type.maxiter}" -distance "${options_type.distance}" - - -prior "${options_type.alphabet_type.prior_type.prior_type_selector}" - #if str( $options_type.alphabet_type.prior_type.prior_type_selector ) != 'addone': - -b "${options_type.alphabet_type.prior_type.prior_b}" - #if str( $options_type.alphabet_type.prior_type.plib ) != 'None': - -plib "${options_type.alphabet_type.prior_type.plib}" - #end if - #end if - - #if str( $options_type.alphabet_type.spmap_type.spmap_type_selector ) == 'cons': - -cons "${options_type.alphabet_type.spmap_type.cons}" - #else - -spmap "${options_type.alphabet_type.spmap_type.spmap_type_selector}" - -spfuzz "${options_type.alphabet_type.spmap_type.spfuzz}" - #end if - - #if str( $options_type.branching_type.branching_type_selector ) == 'x_branch': - -x_branch -bfactor "${options_type.branching_type.bfactor}" -heapsize "${options_type.branching_type.heapsize}" - #end if - - ##-maxsize "1000000" ##remove hardcoded maxsize? should increase number of processors instead - - #end if - - 2>&1 || echo "Error running MEME." - - - && mv ${html_outfile.files_path}/meme.html ${html_outfile} - - && mv ${html_outfile.files_path}/meme.txt ${txt_outfile} - - && mv ${html_outfile.files_path}/meme.xml ${xml_outfile} - - </command> - <inputs> - <param format="fasta" name="input1" type="data" label="Sequences"/> - - <conditional name="options_type"> - <param name="options_type_selector" type="select" label="Options Configuration"> - <option value="basic" selected="true">Basic</option> - <option value="advanced">Advanced</option> - </param> - <when value="basic"> - <!-- do nothing here --> - </when> - <when value="advanced"> - - <param name="sf" type="text" value="Galaxy FASTA Input" label="Name of sequence set" /> - - <conditional name="alphabet_type"> - <param name="alphabet_type_selector" type="select" label="Sequence Alphabet"> - <option value="protein">Protein</option> - <option value="dna" selected="true">DNA</option> - </param> - <when value="protein"> - <conditional name="prior_type"> - <param name="prior_type_selector" type="select" label="Choice of prior"> - <option value="dirichlet">simple Dirichlet prior</option> - <option value="dmix" selected="true">mixture of Dirichlets prior</option> - <option value="mega">extremely low variance dmix</option> - <option value="megap">mega for all but last iteration of EM; dmix on last iteration</option> - <option value="addone">add +1 to each observed count</option> - </param> - <when value="dirichlet"> - <param name="prior_b" type="float" value="0.01" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="dmix"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="mega"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="megap"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="addone"> - <!-- no values here? --> - </when> - </conditional> - <conditional name="spmap_type"> - <param name="spmap_type_selector" type="select" label="EM starting points"> - <option value="uni">uni</option> - <option value="pam" selected="true">pam</option> - <option value="cons">Use starting point from string</option> - </param> - <when value="uni"> - <param name="spfuzz" type="float" value="0.5" label="Fuzziness of the mapping" /> - </when> - <when value="pam"> - <param name="spfuzz" type="integer" value="120" label="Fuzziness of the mapping" /> - </when> - <when value="cons"> - <param name="cons" type="text" value="" label="Starting point from string" /> - </when> - </conditional> - </when> - <when value="dna"> - <param name="revcomp" label="Check reverse complement" type="boolean" truevalue="-revcomp" falsevalue="" checked="False"/> - <param name="pal" label="Check for palindromes" type="boolean" truevalue="-pal" falsevalue="" checked="False"/> - <conditional name="prior_type"> - <param name="prior_type_selector" type="select" label="Sequence Alphabet"> - <option value="dirichlet" selected="true">simple Dirichlet prior</option> - <option value="dmix">mixture of Dirichlets prior</option> - <option value="mega">extremely low variance dmix</option> - <option value="megap">mega for all but last iteration of EM; dmix on last iteration</option> - <option value="addone">add +1 to each observed count</option> - </param> - <when value="dirichlet"> - <param name="prior_b" type="float" value="0.01" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="dmix"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="mega"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="megap"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="addone"> - <!-- no values here? --> - </when> - </conditional> - <conditional name="spmap_type"> - <param name="spmap_type_selector" type="select" label="EM starting points"> - <option value="uni" selected="true">uni</option> - <option value="pam">pam</option> - <option value="cons">Use starting point from string</option> - </param> - <when value="uni"> - <param name="spfuzz" type="float" value="0.5" label="Fuzziness of the mapping" /> - </when> - <when value="pam"> - <param name="spfuzz" type="integer" value="120" label="Fuzziness of the mapping" /> - </when> - <when value="cons"> - <param name="cons" type="text" value="" label="Starting point from string" /> - </when> - </conditional> - </when> - </conditional> - - <param name="nmotifs" type="integer" value="1" label="Number of different motifs to search" /> - <param name="maxsize" type="integer" value="1000000" label="Max number of characters in the sequence file"/> - <param name="evt" type="float" value="inf" label="E-value to stop looking for motifs" /> - <conditional name="mod_type"> - <param name="mod_type_selector" type="select" label="Expected motif distribution"> - <option value="oops">One Occurrence Per Sequence</option> - <option value="zoops" selected="true">Zero or One Occurrence Per Sequence</option> - <option value="anr">Any Number of Repetitions</option> - </param> - <when value="oops"> - <!-- no values here --> - </when> - <when value="zoops"> - <conditional name="motif_occurrence_type"> - <param name="motif_occurrence_type_selector" type="select" label="Number of motif occurrences"> - <option value="default" selected="true">Use defaults</option> - <option value="nsites">nsites</option> - <option value="min_max_sites">min and max sites</option> - </param> - <when value="default"> - <!-- no values here --> - </when> - <when value="nsites"> - <param name="nsites" type="integer" value="1" label="Search nsites number of occurrences" /> - </when> - <when value="min_max_sites"> - <param name="minsites" type="integer" value="1" label="minsites" /> - <param name="maxsites" type="integer" value="50" label="maxsites" /> - </when> - </conditional> - </when> - <when value="anr"> - <conditional name="motif_occurrence_type"> - <param name="motif_occurrence_type_selector" type="select" label="Number of motif occurrences"> - <option value="default" selected="true">Use defaults</option> - <option value="nsites">nsites</option> - <option value="min_max_sites">min and max sites</option> - </param> - <when value="default"> - <!-- no values here --> - </when> - <when value="nsites"> - <param name="nsites" type="integer" value="1" label="Search nsites number of occurrences" /> - </when> - <when value="min_max_sites"> - <param name="minsites" type="integer" value="1" label="minsites" /> - <param name="maxsites" type="integer" value="50" label="maxsites" /> - </when> - </conditional> - </when> - </conditional> - <param name="wnsites" type="float" value="0.8" label="Weight on the prior on nsites" /> - - <conditional name="motif_width_type"> - <param name="motif_width_type_selector" type="select" label="Motif width type"> - <option value="exact">Exact width</option> - <option value="range" selected="true">Specify a range</option> - </param> - <when value="exact"> - <param name="width" type="integer" value="10" label="Width of motif to search" /> - </when> - <when value="range"> - <param name="minw" type="integer" value="8" label="Min width of motif to search" /> - <param name="maxw" type="integer" value="50" label="Max width of motif to search" /> - </when> - </conditional> - - <conditional name="motif_trim_type"> - <param name="motif_trim_type_selector" type="select" label="Motif trim type"> - <option value="nomatrim">No motif trim</option> - <option value="trim" selected="true">Trim motif</option> - </param> - <when value="nomatrim"> - <!-- no values here --> - </when> - <when value="trim"> - <param name="wg" type="integer" value="11" label="Gap cost" /> - <param name="ws" type="integer" value="1" label="Space cost" /> - <param name="noendgaps" label="Do not penalize endgaps" type="boolean" truevalue="-noendgaps" falsevalue="" checked="False"/> - </when> - </conditional> - - <param name="bfile" type="data" format="txt" optional="True" label="Background Model" /> - <param name="pspfile" type="data" format="txt" optional="True" label="Position-Specific Prior" /> - - <param name="maxiter" type="integer" value="50" label="Number of iterations of EM to run" /> - <param name="distance" type="float" value="0.001" label="Convergence criterion" /> - - <conditional name="branching_type"> - <param name="branching_type_selector" type="select" label="x-branching type"> - <option value="x_branch">Perform x-branching</option> - <option value="no_x_branch" selected="true">No x-branching</option> - </param> - <when value="no_x_branch"> - <!-- no values here --> - </when> - <when value="x_branch"> - <param name="bfactor" type="integer" value="3" label="Number of iterations of branching" /> - <param name="heapsize" type="integer" value="64" label="Maximum number of heaps to use" /> - </when> - </conditional> - - </when> - </conditional> - - <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False"> - <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator> - </param> - - </inputs> - <outputs> - <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/> - <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (text)"/> - <data format="memexml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)"/> - </outputs> - <tests> - <test> - <param name="input1" value="meme/meme/meme_input_1.fasta" ftype="fasta" dbkey="hg19"/> - <param name="options_type_selector" value="basic"/> - <param name="non_commercial_use" value="True"/> - <output name="html_outfile" file="meme/meme/meme_output_html_1.html" lines_diff="12"/> - <output name="txt_outfile" file="meme/meme/meme_output_txt_1.txt" lines_diff="12"/> - <output name="xml_outfile" file="meme/meme/meme_output_xml_1.xml" lines_diff="8"/> - </test> - </tests> - <help> - -.. class:: warningmark - -**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.** - -If you want to specify sequence weights, you must include them at the top of your input FASTA file. - -.. class:: infomark - -**To cite MEME:** -Timothy L. Bailey and Charles Elkan, "Fitting a mixture model by expectation maximization to discover motifs in biopolymers", Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology, pp. 28-36, AAAI Press, Menlo Park, California, 1994. - - -For detailed information on MEME, click here_. To view the license_. - -.. _here: http://meme.nbcr.net/meme/meme-intro.html -.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html - - </help> -</tool>
--- a/mytools/memelogo.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -<tool id="memelogo" name="motif logo"> - <description>of MEME motif</description> - <command>ceqlogo -i $input -o tmp.eps -t $title -x '' - && ps2pdf -dEPSCrop tmp.eps $output - </command> - <inputs> - <param name="input" type="data" format="txt" label="MEME motif file"/> - <param name="title" type='text' size="50" label="Title" value="motif1"/> - </inputs> - <outputs> - <data format="pdf" name="output" /> - </outputs> - <help> - -**Description** - -Generate sequence logo for MEME motif file. See details here: - -http://meme.sdsc.edu/meme/doc/ceqlogo.html - -**Example output** - -.. image:: ./static/images/memelogo.png - - </help> -</tool>
--- a/mytools/revcompl.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -import sys - -compldna = {'A':'T', - 'C':'G', - 'G':'C', - 'T':'A', - 'U':'A', - 'M':'K', - 'K':'M', - 'W':'W', - 'S':'S', - 'R':'Y', - 'Y':'R', - 'N':'N'} -complrna = {'A':'U', - 'C':'G', - 'G':'C', - 'T':'A', - 'U':'A', - 'M':'K', - 'K':'M', - 'W':'W', - 'S':'S', - 'R':'Y', - 'Y':'R', - 'N':'N'} -def complement(seq,compl): - complseq = [compl[base] for base in seq] - return complseq - -def reverse_complement(seq,compl): - seq = list(seq) - seq.reverse() - return ''.join(complement(seq,compl)) - -def readFastaFile(infile,outfile,compl): - - fin = open(infile) - out = open(outfile,'w') - - currSeq='' - currSeqname=None - for line in fin: - if '>' in line: - if currSeqname !=None: - out.write(currSeqname+reverse_complement(currSeq,compl)+'\n') - currSeqname=None - currSeq='' - currSeqname=line - else: - currSeq=currSeq+line.strip().upper() - - if currSeqname!=None: - out.write(currSeqname+reverse_complement(currSeq,compl)+'\n') - - fin.close() - out.close() - -def readrawseq(infile,outfile,compl): - ''' - each line is a sequence - ''' - fin = open(infile) - out = open(outfile,'w') - for line in fin: - out.write(reverse_complement(line.strip().upper(),compl)+'\n') - fin.close() - out.close() - -def main(): - seqfile = sys.argv[1] - outfile = sys.argv[2] - fasta = sys.argv[3] - rna = sys.argv[4] - if rna == 'rna': - compl = complrna - else: - compl = compldna - if fasta == 'fasta': - readFastaFile(seqfile,outfile,compl) - else: - readrawseq(seqfile,outfile,compl) - -main()
--- a/mytools/revcompl.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -<tool id="revcompl" name="reverse complement"> - <description>of DNA/RNA sequences</description> - <command interpreter="python">revcompl.py $input $output $fasta $rna </command> - <inputs> - <param name="input" format="txt" type="data" label="Original sequence file"/> - <param name="fasta" label="Check if input is fasta format" type="boolean" truevalue="fasta" falsevalue="txt" checked="False"/> - <param name="rna" label="Check if need to output as RNA sequences" type="boolean" truevalue="rna" falsevalue="dna" checked="False"/> - </inputs> - <outputs> - <data format="input" name="output" /> - </outputs> - <help> - -**What it does** - -This tool outputs reverse complementary of DNA/RNA sequences in the input file. The input can be fasta format or raw sequences (each line is a sequence). - -Degenerate nucleotides are supported. Here is the match table: - -A to T/U - -C to G - -G to C - -T/U to A - -M to K - -W to W - -S to S - -R to Y - -Y to R - -N to N - - </help> -</tool>
--- a/mytools/seq2meme.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -# -*- coding: iso-8859-1 -*- -import random,sys,math - -#import pylab - -def readMotifFile(filename): - - try: - f=open(filename) - except IOError: - print "could not open",filename,"Are you sure this file exists?" - sys.exit(1) - - seqs=[] - maxL = 0 - for line in f: - if '>' in line or 'N' in line: - next - else: - seq = line.strip().upper() - if maxL < len(seq): - maxL = len(seq) - seqs.append(seq) - f.close() - - print len(seqs),'sequences loaded' - print 'max seq length:',maxL - for i in range(len(seqs)): - if len(seqs[i]) < maxL: - del seqs[i] - print len(seqs),'sequences with length = ',maxL - return seqs - - -def createWeightMatrix(seqs,psuedocont): - - motifWidth = len(seqs[0]) - weightMatrix = [] - for i in range(motifWidth): - weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont}) - - #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif - #YOUR CODE HERE - for seq in seqs: - for pos in range(motifWidth): - weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0 - - #Normalize your weight matrix (so that it contains probabilities rather than counts) - #Remember the added psuedocounts when you normalize! - for pos in range(motifWidth): - totalCount = sum(weightMatrix[pos].values()) - for letter in weightMatrix[pos].keys(): - weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount - - #Return your weight matrix - return weightMatrix - -def printMemeFormat(weightMatrix,motifName,filename,nSite,background): - f = open(filename,'w') - f.write('MEME version 4.4\n\n') - - f.write('ALPHABET= ACGT\n\n') - - f.write('strands: + -\n\n') - - f.write('Background letter frequencies (from:\n') - f.write(background+'\n\n') - - f.write('MOTIF '+motifName+'\n\n') - - f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n') - for position in range(len(weightMatrix)): - probsThisPosition=weightMatrix[position] - f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n') - f.write('\n\n') - f.close() - -#get a two decimal-place string representation of a float f -def twoDecimal(f): - return "%.6f" %(f) - -def run(): - - #Get file name from command line - if len(sys.argv) < 3: - print "python seq2meme.py motif_fasta outputfile motifName psuedocont background" - sys.exit(1) - else: - motifFile=sys.argv[1] # - outFile=sys.argv[2] - motifName=sys.argv[3] - psuedocont = float(sys.argv[4]) - background=' '.join(sys.argv[5].strip().split(',')) - - motifs=readMotifFile(motifFile) - - #Create weight matrix - motifWeightMatrix=createWeightMatrix(motifs,psuedocont) - printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background) -run() -
--- a/mytools/seq2meme.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -<tool id="seq2meme" name="create-motif-file"> - <description>from fasta file</description> - <command interpreter="python">seq2meme.py $input $output $motifName $psuedocont $background </command> - <inputs> - <param name="input" type="data" format="txt" label="Sequence file" help="all sequences should be the same length"/> - <param name="motifName" size="20" type="text" value="motif1" label="Motif name (no space allowed)"/> - <param name="psuedocont" size="10" type="float" value="1.0" label="Psuedocount"/> - <param name="background" size="40" type="text" value="A,0.25,C,0.25,G,0.25,T,0.25" label="Background frequency"/> - </inputs> - <outputs> - <data format="txt" name="output" label="$motifName-meme"/> - </outputs> - <help> - -**Description** - -Generate a MEME motif format file from a set of sequences. Input could be raw sequences (one sequence per line) or fasta format (one identifier line and one sequence line). - - - </help> -</tool>
--- a/mytools/seqshuffle.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -import sys - -from altschulEriksonDinuclShuffle import * - -def readFastaFile(infile,outfile): - - fin = open(infile) - out = open(outfile,'w') - - currSeq='' - currSeqname=None - for line in fin: - if '>' in line: - if currSeqname !=None: - out.write(currSeqname+dinuclShuffle(currSeq)+'\n') - currSeqname=None - currSeq='' - currSeqname=line - else: - currSeq=currSeq+line.strip().upper() - - if currSeqname!=None: - out.write(currSeqname+dinuclShuffle(currSeq)+'\n') - - fin.close() - out.close() - -def readrawseq(infile,outfile): - ''' - each line is a sequence - ''' - fin = open(infile) - out = open(outfile,'w') - for line in fin: - out.write(dinuclShuffle(line.strip().upper())+'\n') - fin.close() - out.close() - -def main(): - seqfile = sys.argv[1] - outfile = sys.argv[2] - fasta = sys.argv[3] - - if fasta == 'fasta': - readFastaFile(seqfile,outfile) - else: - readrawseq(seqfile,outfile) - -main()
--- a/mytools/sequence.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,720 +0,0 @@ -#!@WHICHPYTHON@ - -import copy, string, sys - -#------------------ Alphabet ------------------- - -class Alphabet(object): - """Biological alphabet class. - This defines the set of symbols from which various objects can be built, e.g. sequences and motifs. - The symbol set is immutable and accessed as a tuple. - symstr: symbols in alphabet as either a tuple or string - complement: dictionary defining letters and their complement - """ - def __init__(self, symstr, complement = None): - """Construct an alphabet from a string or tuple of characters. - Lower case characters will be converted to upper case. - An optional mapping for complements may be provided. - Example: - >>> alpha = sequence.Alphabet('ACGTttga', {'A':'C', 'G':'T'}) - >>> alpha.getSymbols() - will construct the DNA alphabet and output: - ('A', 'C', 'G', 'T') - """ - symlst = [] - for s in [str(sym).upper()[0] for sym in symstr]: - if not s in symlst: - symlst.append(s) - self.symbols = tuple(symlst) - if complement != None: - # expand the mapping and check for contradictions - cmap = {} - for s in self.symbols: - c = complement.get(s, None) - if c != None: - if s in cmap and cmap[s] != c: - raise RuntimeError("Alphabet complement map " - "contains contradictory mapping") - cmap[s] = c - cmap[c] = s - # replace mapping with indicies - cimap = {} - for idx in range (len(self.symbols)): - s = self.symbols[idx] - if s in cmap: - cimap[cmap[s]] = idx - # create tuple - cidxlst = [] - for idx in range (len(self.symbols)): - cidxlst.append(cimap.get(self.symbols[idx], None)) - self.complements = tuple(cidxlst) - else: - self.complements = None - - def getSymbols(self): - """Retrieve a tuple with all symbols, immutable membership and order""" - return self.symbols - - def getComplements(self): - """Retrieve a tuple with all complement indicies, immutable""" - return self.complements - - def isValidSymbol(self, sym): - """Check if the symbol is a member of alphabet""" - return any([s==sym for s in self.symbols]) - - def getIndex(self, sym): - """Retrieve the index of the symbol (immutable)""" - for idx in range (len(self.symbols)): - if self.symbols[idx] == sym: - return idx - raise RuntimeError("Symbol " + sym + " does not exist in alphabet") - - def isComplementable(self): - return self.complements != None - - def getComplement(self, sym): - """Retrieve the complement of the symbol (immutable)""" - return self.symbols[self.complements[self.getIndex(sym)]]; - - def isValidString(self, symstr): - """Check if the string contains only symbols that belong to the alphabet""" - found = True - for sym in symstr: - if self.isValidSymbol(sym) == False: - return False - return True - - def getLen(self): - """Retrieve the number of symbols in (the length of) the alphabet""" - return len(self.symbols) - -# pre-defined alphabets that can be specified by their name -predefAlphabets = [ - ("DNA" , Alphabet('ACGT', {'A':'T', 'G':'C'})), - ("RNA" , Alphabet('ACGU')), - ("Extended DNA" , Alphabet('ACGTYRN')), - ("Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWY')), - ("Extended Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWYX')), - ("TM Labels" , Alphabet('MIO')) -] - -def getAlphabet(name): - """Retrieve a pre-defined alphabet by name. - Currently, "Protein", "DNA", "RNA", "Extended DNA", "Extended Protein" and "TM Labels" are available. - Example: - >>> alpha = sequence.getAlphabet('Protein') - >>> alpha.getSymbols() - will retrieve the 20 amino acid alphabet and output the tuple: - ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') - """ - for (xname, xalpha) in predefAlphabets: - if xname == name: - return xalpha - return None - -#------------------ Sequence ------------------- - -class Sequence(object): - """Biological sequence class. Sequence data is immutable. - - data: the sequence data as a tuple or string - alpha: the alphabet from which symbols are taken - name: the sequence name, if any - info: can contain additional sequence information apart from the name - """ - def __init__(self, sequence, alpha = None, name = "", seqinfo = ""): - """Create a sequence with sequence data. - Specifying the alphabet is optional, so is the name and info. - Example: - >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF') - will create a sequence with name "", and assign one of the predefined alphabets on basis of what symbols were used. - >>> myseq.getAlphabet().getSymbols() - will most likely output the standard protein alphabet: - ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') - """ - if type(sequence) is str: - self.data = tuple(sequence.upper()) - elif type(sequence) is tuple: - self.data = sequence - elif type(sequence) is list: - self.data = tuple([s.upper() for s in sequence]) - else: - raise RuntimeError("Sequence data is not specified correctly: must be string or tuple") - # Resolve choice of alphabet - validAlphabet = False - if alpha == None: # Alphabet is not set, attempt to set it automatically... - for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, in order - if xalpha.isValidString( self.data ): # This alphabet works, go with it - self.alpha = alpha = xalpha - validAlphabet = True - break - self.name = name - self.info = seqinfo - if validAlphabet == False: # we were either unsuccessful above or the alphabet was specified so test it - if type(alpha) is str: # check if name is a predefined alphabet - for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, check for name - if (xname == alpha): - alpha = xalpha - break - if type(alpha) is Alphabet: # the alphabet is specified - if alpha.isValidString(self.data) == False: - raise RuntimeError("Invalid alphabet specified: "+"".join(alpha.getSymbols())+" is not compatible with sequence '"+"".join(self.data)+"'") - else: - self.alpha = alpha - else: - raise RuntimeError("Could not identify alphabet from sequence") - - #basic getters and setters for the class - def getName(self): - """Get the name of the sequence""" - return self.name - def getInfo(self): - """Get additional info of the sequence (e.g. from the defline in a FASTA file)""" - return self.info - def getAlphabet(self): - """Retrieve the alphabet that is assigned to this sequence""" - return self.alpha - def setName(self, name): - """Change the name of the sequence""" - self.name = name - def setAlphabet(self, alpha): - """Set the alphabet, throws an exception if it is not compatible with the sequence data""" - if type(alpha) is Alphabet: - if alpha.isValid( sequence ) == False: - raise RuntimeError( "Invalid alphabet specified" ) - #sequence functions - def getSequence(self): - """Retrieve the sequence data (a tuple of symbols)""" - return self.data - def getString(self): - """Retrieve the sequence data as a string (copy of actual data)""" - return "".join(self.data) - def getLen(self): - """Get the length of the sequence (number of symbols)""" - return len(self.data) - def getSite(self, position, length = 1): - """Retrieve a site in the sequence of desired length. - Note that positions go from 0 to length-1, and that if the requested site - extends beyond those the method throws an exception. - """ - if position >= 0 and position <= self.getLen() - length: - if length == 1: - return self.data[position] - else: - return self.data[position:position+length] - else: - raise RuntimeError( "Attempt to access invalid position in sequence "+self.getName() ) - - def nice(self): - """ A short description of the sequence """ - print self.getName(), ":", self.getLen() - -def readStrings(filename): - """ Read one or more lines of text from a file--for example an alignment. - Return as a list of strings. - filename: name of file - """ - txtlist = [] - f = open(filename) - for line in f.readlines(): - txtlist.extend(line.split()) - return txtlist - -def readFASTA(filename, alpha = None): - """ Read one or more sequences from a file in FASTA format. - filename: name of file to load sequences from - alpha: alphabet that is used (if left unspecified, an attempt is made to identify the alphabet for each individual sequence) - """ - seqlist = [] - seqname = None - seqinfo = None - seqdata = [] - fh = open(filename) - thisline = fh.readline() - while (thisline): - if (thisline[0] == '>'): # new sequence - if (seqname): # take care of the data that is already in the buffer before processing the new sequence - try: - seqnew = Sequence(seqdata, alpha, seqname, seqinfo) - seqlist.append(seqnew) - except RuntimeError, e: - print >> sys.stderr, "Warning: "+seqname+" is invalid (ignored): ", e - seqinfo = thisline[1:-1] # everything on the defline is "info" - seqname = seqinfo.split()[0] # up to first space - seqdata = [] - else: # pull out the sequence data - cleanline = thisline.split() - for line in cleanline: - seqdata.extend(tuple(line.strip('*'))) # sometimes a line ends with an asterisk in FASTA files - thisline = fh.readline() - - if (seqname): - try: - seqnew = Sequence(seqdata, alpha, seqname, seqinfo) - seqlist.append(seqnew) - except RuntimeError, e: - print >> sys.stderr, "Warning: " + seqname + " is invalid (ignored): ", e - else: - raise RuntimeError("No sequences on FASTA format found in this file") - fh.close() - return seqlist - -def _writeOneFASTA(sequence, filehandle): - """Write one sequence in FASTA format to an already open file""" - filehandle.write(">" + sequence.getName()+"\n") - data = sequence.getSequence() - lines = ( sequence.getLen() - 1) / 60 + 1 - for i in range(lines): - #note: python lets us get the last line (var length) free - #lineofseq = data[i*60 : (i+1)*60] + "\n" - lineofseq = "".join(data[i*60 : (i+1)*60]) + "\n" - filehandle.write(lineofseq) - -def writeFASTA(sequence, filename): - """Write a list (or a single) of sequences to a file in the FASTA format""" - fh = open(filename, "w") - if isinstance(sequence, Sequence): - _writeOneFASTA(sequence, fh) - else: - for seq in sequence: - if isinstance(seq, Sequence): - _writeOneFASTA(seq, fh) - else: - print >> sys.stderr, "Warning: could not write " + seq.getName() + " (ignored)." - fh.flush() - fh.close() - -#------------------ Distrib ------------------- - -class Distrib(object): - """Class for storing a multinomial probability distribution over the symbols in an alphabet""" - def __init__(self, alpha, pseudo_count = 0.0): - self.alpha = alpha - self.tot = pseudo_count * self.alpha.getLen() - self.cnt = [pseudo_count for _ in range( self.alpha.getLen() )] - - def __deepcopy__(self, memo): - dup = Distrib(self.alpha) - dup.tot = copy.deepcopy(self.tot, memo) - dup.cnt = copy.deepcopy(self.cnt, memo) - return dup - - def count(self, syms = None ): - """Count an observation of a symbol""" - if syms == None: - syms = self.alpha.getSymbols() - for sym in syms: - idx = self.alpha.getIndex( sym ) - self.cnt[idx] += 1.0 - self.tot += 1 - - def complement(self): - """Complement the counts, throw an error if this is impossible""" - if not self.alpha.isComplementable(): - raise RuntimeError("Attempt to complement a Distrib " - "based on a non-complementable alphabet.") - coms = self.alpha.getComplements() - new_count = [] - for idx in range(len(coms)): - cidx = coms[idx] - if cidx == None: - cidx = idx - new_count.append(self.cnt[cidx]) - self.cnt = new_count - return self - - def reset(self): - """Reset the distribution, that is, restart counting.""" - self.tot = 0 - self.cnt = [0.0 for _ in range( self.alpha.getLen() )] - - def getFreq(self, sym = None): - """Determine the probability distribution from the current counts. - The order in which probabilities are given follow the order of the symbols in the alphabet.""" - if self.tot > 0: - if sym == None: - freq = tuple([ y / self.tot for y in self.cnt ]) - return freq - else: - idx = self.alpha.getIndex( sym ) - return self.cnt[idx] / self.tot - return None - - def pretty(self): - """Retrieve the probabilites for all symbols and return as a pretty table (a list of text strings)""" - table = ["".join(["%4s " % s for s in self.alpha.getSymbols()])] - table.append("".join(["%3.2f " % y for y in Distrib.getFreq(self)])) - return table - - def getSymbols(self): - """Get the symbols in the alphabet in the same order as probabilities are given.""" - return self.alpha.getSymbols() - - def getAlphabet(self): - """Get the alphabet over which the distribution is defined.""" - return self.alpha - -#------------------ Motif (and subclasses) ------------------- - -class Motif(object): - """ Sequence motif class--defining a pattern that can be searched in sequences. - This class is not intended for direct use. Instead use and develop sub-classes (see below). - """ - def __init__(self, alpha): - self.len = 0 - self.alpha = alpha - - def getLen(self): - """Get the length of the motif""" - return self.len - - def getAlphabet(self): - """Get the alphabet that is used in the motif""" - return self.alpha - - def isAlphabet(self, seqstr): - """Check if the sequence can be processed by this motif""" - mystr = seqstr - if type(seqstr) is Sequence: - mystr = seqstr.getString() - return self.getAlphabet().isValidString(mystr) - -import re - -class RegExp(Motif): - """A motif class that defines the pattern in terms of a regular expression""" - def __init__(self, alpha, re_string): - Motif.__init__(self, alpha) - self.pattern = re.compile(re_string) - - def match(self, seq): - """Find matches to the motif in a specified sequence. - The method is a generator, hence subsequent hits can be retrieved using next(). - The returned result is a tuple (position, match-sequence, score), where score is - always 1.0 since a regular expression is either true or false (not returned). - """ - myseq = seq - if not type(seq) is Sequence: - myseq = Sequence(seq, self.alpha) - mystr = myseq.getString() - if not Motif.isAlphabet(self, mystr): - raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName()) - for m in re.finditer(self.pattern, mystr): - yield (m.start(), m.group(), 1.0) - -import math, time - -# Variables used by the PWM for creating an EPS file -_colour_def = ( - "/black [0 0 0] def\n" - "/red [0.8 0 0] def\n" - "/green [0 0.5 0] def\n" - "/blue [0 0 0.8] def\n" - "/yellow [1 1 0] def\n" - "/purple [0.8 0 0.8] def\n" - "/magenta [1.0 0 1.0] def\n" - "/cyan [0 1.0 1.0] def\n" - "/pink [1.0 0.8 0.8] def\n" - "/turquoise [0.2 0.9 0.8] def\n" - "/orange [1 0.7 0] def\n" - "/lightred [0.8 0.56 0.56] def\n" - "/lightgreen [0.35 0.5 0.35] def\n" - "/lightblue [0.56 0.56 0.8] def\n" - "/lightyellow [1 1 0.71] def\n" - "/lightpurple [0.8 0.56 0.8] def\n" - "/lightmagenta [1.0 0.7 1.0] def\n" - "/lightcyan [0.7 1.0 1.0] def\n" - "/lightpink [1.0 0.9 0.9] def\n" - "/lightturquoise [0.81 0.9 0.89] def\n" - "/lightorange [1 0.91 0.7] def\n") -_colour_dict = ( - "/fullColourDict <<\n" - " (G) orange\n" - " (T) green\n" - " (C) blue\n" - " (A) red\n" - " (U) green\n" - ">> def\n" - "/mutedColourDict <<\n" - " (G) lightorange\n" - " (T) lightgreen\n" - " (C) lightblue\n" - " (A) lightred\n" - " (U) lightgreen\n" - ">> def\n" - "/colorDict fullColourDict def\n") - -_eps_defaults = { - 'LOGOTYPE': 'NA', - 'FONTSIZE': '12', - 'TITLEFONTSIZE': '12', - 'SMALLFONTSIZE': '6', - 'TOPMARGIN': '0.9', - 'BOTTOMMARGIN': '0.9', - 'YAXIS': 'true', - 'YAXISLABEL': 'bits', - 'XAXISLABEL': '', - 'TITLE': '', - 'ERRORBARFRACTION': '1.0', - 'SHOWINGBOX': 'false', - 'BARBITS': '2.0', - 'TICBITS': '1', - 'COLORDEF': _colour_def, - 'COLORDICT': _colour_dict, - 'SHOWENDS': 'false', - 'NUMBERING': 'true', - 'OUTLINE': 'false', -} -class PWM(Motif): - """This motif subclass defines a pattern in terms of a position weight matrix. - An alphabet must be provided. A pseudo-count to be added to each count is - optional. A uniform background distribution is used by default. - """ - def __init__(self, alpha): - Motif.__init__(self, alpha) # set alphabet of this multinomial distribution - self.background = Distrib(alpha) # the default background ... - self.background.count(alpha.getSymbols()) # ... is uniform - self.nsites = 0 - - def setFromAlignment(self, aligned, pseudo_count = 0.0): - """Set the probabilities in the PWM from an alignment. - The alignment is a list of equal-length strings (see readStrings), OR - a list of Sequence. - """ - self.cols = -1 - self.nsites = len(aligned) - seqs = [] - # Below we create a list of Sequence from the alignment, - # while doing some error checking, and figure out the number of columns - for s in aligned: - # probably a text string, so we make a nameless sequence from it - if not type(s) is Sequence: - s=Sequence(s, Motif.getAlphabet(self)) - else: - # it was a sequence, so we check that the alphabet in - # this motif will be able to process it - if not Motif.isAlphabet(self, s): - raise RuntimeError("Motif alphabet is not valid for sequence " + s.getName()) - if self.cols == -1: - self.cols = s.getLen() - elif self.cols != s.getLen(): - raise RuntimeError("Sequences in alignment are not of equal length") - seqs.append(s) - # The line below initializes the list of Distrib (one for each column of the alignment) - self.counts = [Distrib(Motif.getAlphabet(self), pseudo_count) for _ in range(self.cols)] - # Next, we do the counting, column by column - for c in range( self.cols ): # iterate through columns - for s in seqs: # iterate through rows - # determine the index of the symbol we find at this position (row, column c) - self.counts[c].count(s.getSite(c)) - # Update the length - self.len = self.cols - - def reverseComplement(self): - """Reverse complement the PWM""" - i = 0 - j = len(self.counts)-1 - while (i < j): - temp = self.counts[i]; - self.counts[i] = self.counts[j] - self.counts[j] = temp - self.counts[i].complement() - self.counts[j].complement() - i += 1; - j -= 1; - if i == j: - self.counts[i].complement() - return self - - def getNSites(self): - """Get the number of sites that made the PWM""" - return self.nsites - - def setBackground(self, distrib): - """Set the background distribution""" - if not distrib.getAlphabet() == Motif.getAlphabet(self): - raise RuntimeError("Incompatible alphabets") - self.background = distrib - - def getFreq(self, col = None, sym = None): - """Get the probabilities for all positions in the PWM (a list of Distribs)""" - if (col == None): - return [y.getFreq() for y in self.counts] - else: - return self.counts[col].getFreq(sym) - - def pretty(self): - """Retrieve the probabilites for all positions in the PWM as a pretty table (a list of text strings)""" - #table = ["".join(["%8s " % s for s in self.alpha.getSymbols()])] - table = [] - for row in PWM.getFreq(self): - table.append("".join(["%8.6f " % y for y in row])) - return table - - def logoddsPretty(self, bkg): - """Retrieve the (base-2) log-odds for all positions in the PWM as a pretty table (a list of text strings)""" - table = [] - for row in PWM.getFreq(self): - #table.append("".join(["%8.6f " % (math.log((row[i]+1e-6)/bkg[i])/math.log(2)) for i in range(len(row))])) - table.append("".join(["%8.6f " % (math.log((row[i])/bkg[i])/math.log(2)) for i in range(len(row))])) - #table.append("".join(["%8.6f " % row[i] for i in range(len(row))])) - return table - - - def consensus_sequence(self): - """ - Get the consensus sequence corresponding to a PWM. - Consensus sequence is the letter in each column - with the highest probability. - """ - consensus = "" - alphabet = Motif.getAlphabet(self).getSymbols() - for pos in range(self.cols): - best_letter = alphabet[0] - best_p = self.counts[pos].getFreq(best_letter) - for letter in alphabet[1:]: - p = self.counts[pos].getFreq(letter) - if p > best_p: - best_p = p - best_letter = letter - consensus += best_letter - return consensus - - - def consensus(self): - """ - Get the consensus corresponding to a PWM. - Consensus at each column of motif is a list of - characters with non-zero probabilities. - """ - consensus = [] - for pos in range(self.cols): - matches = [] - for letter in Motif.getAlphabet(self).getSymbols(): - p = self.counts[pos].getFreq(letter) - if p > 0: - matches += letter - consensus.append(matches) - return consensus - - - def getScore(self, seq, start): - """Score this particular list of symbols using the PFM (background needs to be set separately)""" - sum = 0.0 - seqdata = seq.getSequence()[start : start+self.cols] - for pos in range(len(seqdata)): - q = self.counts[pos].getFreq(seqdata[pos]) - if q == 0: - q = 0.0001 # to avoid log(0) == -Infinity - logodds = math.log(q / self.background.getFreq(seqdata[pos])) - sum += logodds - return sum - - def match(self, seq, _LOG0 = -10): - """Find matches to the motif in a specified sequence. - The method is a generator, hence subsequent hits can be retrieved using next(). - The returned result is a tuple (position, match-sequence, score). - The optional parameter _LOG0 specifies a lower bound on reported logodds scores. - """ - myseq = seq - if not type(seq) is Sequence: - myseq = Sequence(seq, self.alpha) - if not Motif.isAlphabet(self, myseq): - raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName()) - for pos in range(myseq.getLen() - self.cols): - score = PWM.getScore(self, myseq, pos) - if score > _LOG0: - yield (pos, "".join(myseq.getSite(pos, self.cols)), score) - - def writeEPS(self, program, template_file, eps_fh, - timestamp = time.localtime()): - """Write out a DNA motif to EPS format.""" - small_dfmt = "%d.%m.%Y %H:%M" - full_dfmt = "%d.%m.%Y %H:%M:%S %Z" - small_date = time.strftime(small_dfmt, timestamp) - full_date = time.strftime(full_dfmt, timestamp) - points_per_cm = 72.0 / 2.54 - height = 4.5 - width = self.getLen() * 0.8 + 2 - width = min(30, width) - points_height = int(height * points_per_cm) - points_width = int(width * points_per_cm) - defaults = _eps_defaults.copy() - defaults['CREATOR'] = program - defaults['CREATIONDATE'] = full_date - defaults['LOGOHEIGHT'] = str(height) - defaults['LOGOWIDTH'] = str(width) - defaults['FINEPRINT'] = program + ' ' + small_date - defaults['CHARSPERLINE'] = str(self.getLen()) - defaults['BOUNDINGHEIGHT'] = str(points_height) - defaults['BOUNDINGWIDTH'] = str(points_width) - defaults['LOGOLINEHEIGHT'] = str(height) - with open(template_file, 'r') as template_fh: - m_var = re.compile("\{\$([A-Z]+)\}") - for line in template_fh: - last = 0 - match = m_var.search(line) - while (match): - if (last < match.start()): - prev = line[last:match.start()] - eps_fh.write(prev) - key = match.group(1) - if (key == "DATA"): - eps_fh.write("\nStartLine\n") - for pos in range(self.getLen()): - eps_fh.write("({0:d}) startstack\n".format(pos+1)) - stack = [] - # calculate the stack information content - alpha_ic = 2 - h = 0 - for sym in self.getAlphabet().getSymbols(): - freq = self.getFreq(pos, sym) - if (freq == 0): - continue - h -= (freq * math.log(freq, 2)) - stack_ic = alpha_ic - h - # calculate the heights of each symbol - for sym in self.getAlphabet().getSymbols(): - freq = self.getFreq(pos, sym) - if (freq == 0): - continue - stack.append((freq * stack_ic, sym)) - stack.sort(); - # output the symbols - for symh, sym in stack: - eps_fh.write(" {0:f} ({1:s}) numchar\n".format( - symh, sym)) - eps_fh.write("endstack\n\n") - eps_fh.write("EndLine\n") - elif (key in defaults): - eps_fh.write(defaults[key]) - else: - raise RuntimeError('Unknown variable "' + key + - '" in EPS template') - last = match.end(); - match = m_var.search(line, last) - if (last < len(line)): - eps_fh.write(line[last:]) - - -#------------------ Main method ------------------- -# Executed if you run this file from the operating system prompt, e.g. -# > python sequence.py - -if __name__=='__main__': - alpha = getAlphabet('Extended DNA') - #seqs = readFASTA('pos.fasta') - seqs = [] - aln = readStrings('tmp0') - #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].') - pwm = PWM(alpha) - pwm.setFromAlignment(aln) - for row in pwm.pretty(): - print row - for s in seqs: - print s.getName(), s.getLen(), s.getAlphabet().getSymbols() - for m in regexp.match( s ): - print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]) - for m in pwm.match( s ): - print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])
--- a/mytools/splicesite.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -<tool id="splicesite" name="splice site score"> - <description>using max entropy model</description> - <command interpreter="perl">$script $input > $out_file1 </command> - <inputs> - <param name="input" format="txt" type="data" label="Sequence file" help="fasta or raw sequence (one sequence per line)"/> - <param name="script" type="select" label="Select model"> - <option value="splicesitescore/score5.pl" selected="true">5' splice site</option> - <option value="splicesitescore/score3.pl">3' splice site</option> - </param> - </inputs> - <outputs> - <data format="tabular" name="out_file1" /> - </outputs> - <help> - -**What it does** - -This tool computes splice site scores using a max entropy model. See more details here: - -http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html - ------ - -**Example input for 5' splice site sequence** - -3 exonic and 6 intronic nucleotides flanking the junction:: - - CTGGTGAGT - AAGGTACAG - -or fasta format:: - - >seq1 - CTGGTGAGT - >seq2 - AAGGTACAG - -Output:: - - CTGGTGAGT 10.10 - AAGGTACAG 8.04 - -or fasta format:: - - >seq1 CTGGTGAGT 10.10 - >seq2 AAGGTACAG 8.04 - - ------ - -**Example input for 3' splice site sequence** - -3 exonic and 20 intronic nucleotides flanking the junction:: - - CCTGCATCCTCTGTTCCCAGGTG - TTTCTTCCCTCCGGGAACAGTGG - -Output:: - - CCTGCATCCTCTGTTCCCAGGTG 10.47 - TTTCTTCCCTCCGGGAACAGTGG 6.22 - - </help> -</tool>