# HG changeset patch # User Richard Burhans # Date 1363015686 14400 # Node ID 95a05c1ef5d507e7b0af0089eb655d10640ac1e4 # Parent d6b961721037f1ddb900a86f4ab003a92635f80e update to devshed revision aaece207bd01 diff -r d6b961721037 -r 95a05c1ef5d5 README --- a/README Mon Nov 05 12:44:17 2012 -0500 +++ b/README Mon Mar 11 11:28:06 2013 -0400 @@ -5,9 +5,14 @@ matplotlib (we used version 1.1.0) http://pypi.python.org/packages/source/m/matplotlib/ mechanize (we used version 0.2.5) http://pypi.python.org/packages/source/m/mechanize/ networkx (we used version 1.6) http://pypi.python.org/packages/source/n/networkx/ + fisher (we used version 0.1.4) http://pypi.python.org/packages/source/f/fisher/ And the following software: ADMIXTURE (we used version 1.22) http://www.genetics.ucla.edu/software/admixture/ EIGENSOFT (we used version 3.0) http://genepath.med.harvard.edu/~reich/Software.htm PHAST (we used version 1.2.1) http://compgen.bscb.cornell.edu/phast/ QuickTree (we used version 1.1) http://www.sanger.ac.uk/resources/software/quicktree/ + +Images used in the tools' documentation are located in the static/images +directory. Please copy these to the static/images directory in your +Galaxy installation. diff -r d6b961721037 -r 95a05c1ef5d5 add_fst_column.xml --- a/add_fst_column.xml Mon Nov 05 12:44:17 2012 -0500 +++ b/add_fst_column.xml Mon Mar 11 11:28:06 2013 -0400 @@ -15,8 +15,8 @@ - - + + @@ -33,9 +33,9 @@ - + - + diff -r d6b961721037 -r 95a05c1ef5d5 aggregate_gd_indivs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/aggregate_gd_indivs.py Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +import sys +import subprocess +from Population import Population + +################################################################################ + +if len(sys.argv) < 9: + print >> sys.stderr, "Usage" + sys.exit(1) + +#input, p1_input, output, lo, hi, lo_ind, lo_ind_qual = sys.argv[1:8] +#individual_metadata = sys.argv[8:] +input, p1_input, output, = sys.argv[1:4] +individual_metadata = sys.argv[4:] + +p_total = Population() +p_total.from_tag_list(individual_metadata) + +p1 = Population() +p1.from_population_file(p1_input) + +if not p_total.is_superset(p1): + print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' + sys.exit(1) + +################################################################################ + +prog = 'aggregate' + +args = [] +args.append(prog) +args.append(input) + +columns = p1.column_list() + +for column in sorted(columns): + args.append(column) + +fh = open(output, 'w') + +#print "args:", ' '.join(args) +p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr) +rc = p.wait() +fh.close() + +sys.exit(0) + diff -r d6b961721037 -r 95a05c1ef5d5 aggregate_gd_indivs.xml --- a/aggregate_gd_indivs.xml Mon Nov 05 12:44:17 2012 -0500 +++ b/aggregate_gd_indivs.xml Mon Mar 11 11:28:06 2013 -0400 @@ -2,7 +2,7 @@ : Append summary columns for a population - modify_snp_table.py "$input" "$p1_input" "$output" "-1" "-1" "-1" "-1" + aggregate_gd_indivs.py "$input" "$p1_input" "$output" #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) #set $arg = '%s:%s' % ($individual_col, $individual) "$arg" diff -r d6b961721037 -r 95a05c1ef5d5 average_fst.xml --- a/average_fst.xml Mon Nov 05 12:44:17 2012 -0500 +++ b/average_fst.xml Mon Mar 11 11:28:06 2013 -0400 @@ -1,4 +1,4 @@ - + : Estimate the relative fixation index between two populations @@ -21,8 +21,8 @@ - - + + @@ -95,10 +95,10 @@ The program prints the following measures of FST for the two populations. -1. The formulation by Sewall Wright (average over FSTs for all SNPs). -2. The Weir-Cockerham estimator (average over FSTs for all SNPs). -3. The Reich-Patterson estimator (average over FSTs for all SNPs). -4. The population-based Reich-Patterson estimator. +1. The Reich-Patterson estimator (average over FSTs for all SNPs). +2. The population-based Reich-Patterson estimator. +3. The formulation by Sewall Wright (average over FSTs for all SNPs). +4. The Weir-Cockerham estimator (average over FSTs for all SNPs). If randomizations were requested, it prints a summary for each of the four definitions of FST that includes the maximum and average value, and the highest-scoring population pair (if any scored higher than the two user-specified populations). @@ -123,10 +123,10 @@ - output:: Using 37847 SNPs, we compute: + Average Reich-Patterson FST is 0.31012. + The population-based Reich-Patterson Fst is 0.33625. Average Wright FST is 0.22810. Average Weir-Cockerham FST is 0.30813. - Average Reich-Patterson FST is 0.31012. - The population-based Reich-Patterson Fst is 0.33625. diff -r d6b961721037 -r 95a05c1ef5d5 calctfreq.py --- a/calctfreq.py Mon Nov 05 12:44:17 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# calcfreq.py -# -# Copyright 2011 Oscar Bedoya-Reina -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, -# MA 02110-1301, USA. - -import argparse,os,sys -from decimal import Decimal,getcontext -from LocationFile import LocationFile - -#method to rank the the pthways by mut. freq. -def rankd(ltfreqs): - ordvals=sorted(ltfreqs)#sort and reverse freqs. - #~ - outrnk=[] - tmpFreq0,tmpCount,tmpPthw=ordvals.pop()#the highest possible value - crank=1 - outrnk.append('\t'.join([str(tmpCount),str(tmpFreq0),str(crank),tmpPthw])) - totalnvals=len(ordvals) - cnt=0 - while totalnvals>cnt: - cnt+=1 - tmpFreq,tmpCount,tmpPthw=ordvals.pop() - if tmpFreq!=tmpFreq0: - crank=len(outrnk)+1 - tmpFreq0=tmpFreq - outrnk.append('\t'.join([str(tmpCount),str(tmpFreq),str(crank),tmpPthw])) - return outrnk - - -def main(): - parser = argparse.ArgumentParser(description='Obtain KEGG images from a list of genes.') - parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format') - parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the count of genes in the list, Column 2 is the percentage of the pathway genes present on the list. Column 3 is the rank based on column 2') - parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name') - parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code') - parser.add_argument('--loc_file',metavar='location file',type=str,help='location file') - parser.add_argument('--species',metavar='species',type=str,help='species') - #~Open arguments - class C(object): - pass - fulargs=C() - parser.parse_args(sys.argv[1:],namespace=fulargs) - #test input vars - inputf,outputf,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn - locf,species=fulargs.loc_file,fulargs.species - #make a dictionary of valid genes - posKEGGclmn-=1 - Kgeneposcolmn-=1 - dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set(x.split('\t')[posKEGGclmn].split('.'))) for x in open(inputf).read().splitlines()[1:] if x.strip()]) - sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1]) - while True:#to correct names with more than one gene - try: - mgenes=sdGenes.pop() - pthwsAssotd=dKEGGcPthws.pop(mgenes) - mgenes=mgenes.split('.') - for eachg in mgenes: - dKEGGcPthws[eachg]=pthwsAssotd - except: - break - #~ Count genes - getcontext().prec=2#set 2 decimal places - - location_file = LocationFile(locf) - prefix, kxml_dir_path, dict_file = location_file.get_values(species) - dPthContsTotls = {} - try: - with open(dict_file) as fh: - for line in fh: - line = line.rstrip('\r\n') - value, key = line.split('\t') - dPthContsTotls[key] = int(value) - except IOError, err: - print >> sys.stderr, 'Error opening dict file {0}: {1}'.format(dict_file, err.strerror) - sys.exit(1) - - dPthContsTmp=dict([(x,0) for x in dPthContsTotls.keys()])#create a list of genes - sdGenes=set([x for x in dKEGGcPthws.keys()])#list of all genes - cntGens=0 - ltGens=len(sdGenes) - while cntGens> sys.stderr, "Error: pathway not found in database: '{0}'".format(eachP) - sys.exit(1) - cntGens+=1 - #~ Calculate Freqs. - ltfreqs=[((Decimal(dPthContsTmp[x])/Decimal(dPthContsTotls[x])),Decimal(dPthContsTmp[x]),x) for x in dPthContsTotls] - tabllfreqs='\n'.join(rankd(ltfreqs)) - salef=open(outputf,'w') - salef.write(tabllfreqs) - salef.close() - return 0 - - -if __name__ == '__main__': - main() diff -r d6b961721037 -r 95a05c1ef5d5 commits.log --- a/commits.log Mon Nov 05 12:44:17 2012 -0500 +++ b/commits.log Mon Mar 11 11:28:06 2013 -0400 @@ -1,3 +1,17 @@ + +:ea75d4a4ded0 +cathy 2013-03-04 16:04 +- documented the new Restore Attributes tool, and tweaked its description and UI +- adjusted doc for the new Rank Terms tool, and tweaked its description and UI +- added doc paragraph for new ability of Filter SNPs to handle % thresholds +- tweaked description and doc for Rank Pathways and PCA +- bumped version numbers for Filter SNPs and Remarkable Intervals, due to + new functionality and bug fix, respectively + +:b63c3675e0a3 +cathy 2013-02-04 18:31 +Edited the README to instruct users to copy the files in static/images +into their Galaxy installation. :f556345a4185 cathy 2012-11-02 17:45 diff -r d6b961721037 -r 95a05c1ef5d5 cp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cp.py Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +import shutil +import sys + +if len(sys.argv) != 3: + print >> sys.stderr, 'Usage: %s ' % sys.argv[0] + sys.exit(1) + +src, dst = sys.argv[1:3] +shutil.copy2(src, dst) diff -r d6b961721037 -r 95a05c1ef5d5 datatypes_conf.xml --- a/datatypes_conf.xml Mon Nov 05 12:44:17 2012 -0500 +++ b/datatypes_conf.xml Mon Mar 11 11:28:06 2013 -0400 @@ -8,6 +8,7 @@ + diff -r d6b961721037 -r 95a05c1ef5d5 filter_gd_snp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_gd_snp.py Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,82 @@ +#!/usr/bin/env python + +import sys +import subprocess +from Population import Population + +################################################################################ + +def convert_non_negative_int(string_value): + try: + val = int(string_value) + except: + print >> sys.stderr, '"%s" is not an integer' % string_value + sys.exit(1) + + if val < 0: + print >> sys.stderr, '"%d" is negative' % val + sys.exit(1) + + return val + + +def convert_percent(string_value): + if string_value.endswith('%'): + val = convert_non_negative_int(string_value[:-1]) + if val > 100: + print >> sys.stderr, 'percentage: "%d" > 100' % val + sys.exit(1) + val = val * -1 + else: + val = convert_non_negative_int(string_value) + + return str(val) + +################################################################################ + +if len(sys.argv) < 9: + print >> sys.stderr, "Usage" + sys.exit(1) + +input, p1_input, output, lo, hi, lo_ind, lo_ind_qual = sys.argv[1:8] +individual_metadata = sys.argv[8:] + +p_total = Population() +p_total.from_tag_list(individual_metadata) + +p1 = Population() +p1.from_population_file(p1_input) + +if not p_total.is_superset(p1): + print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' + sys.exit(1) + +lo = convert_percent(lo) +hi = convert_percent(hi) + +################################################################################ + +prog = 'filter_snps' + +args = [] +args.append(prog) +args.append(input) +args.append(lo) +args.append(hi) +args.append(lo_ind) +args.append(lo_ind_qual) + +columns = p1.column_list() + +for column in sorted(columns): + args.append(column) + +fh = open(output, 'w') + +#print "args:", ' '.join(args) +p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr) +rc = p.wait() +fh.close() + +sys.exit(0) + diff -r d6b961721037 -r 95a05c1ef5d5 filter_gd_snp.xml --- a/filter_gd_snp.xml Mon Nov 05 12:44:17 2012 -0500 +++ b/filter_gd_snp.xml Mon Mar 11 11:28:06 2013 -0400 @@ -1,8 +1,8 @@ - + : Discard some SNPs based on coverage or quality - modify_snp_table.py "$input" "$p1_input" "$output" "$lo_coverage" "$hi_coverage" "$low_ind_cov" "$lo_quality" + filter_gd_snp.py "$input" "$p1_input" "$output" "$lo_coverage" "$hi_coverage" "$low_ind_cov" "$lo_quality" #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) #set $arg = '%s:%s' % ($individual_col, $individual) "$arg" @@ -12,8 +12,22 @@ - - + + + + + + + + + + + + + + + + @@ -55,6 +69,14 @@ for the population is too low or too high, or if their coverage or quality score for any individual in the population is too low. +The upper and lower bounds on total population coverage can be specified +either as read counts or as percentiles (e.g. "5%", with no decimal places). +For percentile bounds the SNPs are ranked by read count, so for example, a +lower bound of "10%" means that the least-covered 10% of the SNPs will be +discarded, while an upper bound of, say, "80%" will discard all SNPs above +the 80% mark, i.e. the top 20%. The threshold for the lower bound on +individual coverage can only be specified as a plain read count. + ----- **Example** diff -r d6b961721037 -r 95a05c1ef5d5 find_intervals.xml --- a/find_intervals.xml Mon Nov 05 12:44:17 2012 -0500 +++ b/find_intervals.xml Mon Mar 11 11:28:06 2013 -0400 @@ -1,4 +1,4 @@ - + : Find high-scoring runs of SNPs diff -r d6b961721037 -r 95a05c1ef5d5 genome_diversity/src/Fst_ave.c --- a/genome_diversity/src/Fst_ave.c Mon Nov 05 12:44:17 2012 -0500 +++ b/genome_diversity/src/Fst_ave.c Mon Mar 11 11:28:06 2013 -0400 @@ -234,11 +234,11 @@ pop_Fst(); printf("Using %d SNPs, we compute:\n", nsnp); - printf("Average Wright FST is %5.5f.\n", F0 = F_wright); - printf("Average Weir-Cockerham FST is %5.5f.\n", F1 = F_weir); printf("Average Reich-Patterson FST is %5.5f.\n", F2 = F_reich); printf("The population-based Reich-Patterson Fst is %5.5f.\n", F3 = N_reich/D_reich); + printf("Average Wright FST is %5.5f.\n", F0 = F_wright); + printf("Average Weir-Cockerham FST is %5.5f.\n", F1 = F_weir); if (nfail > 0) printf("WARNING: %d of %d FSTs could not be computed\n", nfail, 3*nsnp); diff -r d6b961721037 -r 95a05c1ef5d5 genome_diversity/src/Makefile --- a/genome_diversity/src/Makefile Mon Nov 05 12:44:17 2012 -0500 +++ b/genome_diversity/src/Makefile Mon Mar 11 11:28:06 2013 -0400 @@ -4,8 +4,8 @@ CFLAGS = $(COPT) $(CWARN) INSTALL_DIR = ../bin -TARGETS = admix_prep coords2admix coverage dist_mat dpmix eval2pct \ - Fst_ave Fst_column pop sweep +TARGETS = admix_prep aggregate coords2admix coverage dist_mat dpmix \ + eval2pct filter_snps Fst_ave Fst_column get_pi sweep all: $(TARGETS) @@ -16,6 +16,9 @@ admix_prep: admix_prep.c lib.c $(CC) $(CFLAGS) $^ -o $@ +aggregate: aggregate.c lib.c + $(CC) $(CFLAGS) $^ -o $@ + coords2admix: coords2admix.c lib.c $(CC) $(CFLAGS) $^ -o $@ @@ -31,13 +34,16 @@ eval2pct: eval2pct.c lib.c $(CC) $(CFLAGS) $^ -o $@ +filter_snps: filter_snps.c lib.c + $(CC) $(CFLAGS) $^ -o $@ + Fst_ave: Fst_ave.c Fst_lib.c lib.c $(CC) $(CFLAGS) $^ -o $@ Fst_column: Fst_column.c Fst_lib.c lib.c $(CC) $(CFLAGS) $^ -o $@ -pop: pop.c lib.c +get_pi: get_pi.c lib.c $(CC) $(CFLAGS) $^ -o $@ sweep: sweep.c lib.c Huang.c diff -r d6b961721037 -r 95a05c1ef5d5 genome_diversity/src/aggregate.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genome_diversity/src/aggregate.c Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,62 @@ +/* aggregate -- add four columns (allele counts, genotype, maximum quality) for +* a specified population to a Galaxy SNP table +* +* argv[1] = file containing a Galaxy table +* argv[2] ... are the starting columns (base-1) for the chosen individuals + +What it does on Galaxy +The user specifies that some of the individuals in a gd_snp dataset form a "population", by supplying a list that has been previously created using the Specify Individuals tool. The program appends a new "entity" (set of four columns) to the gd_snp table, analogous to the columns for an individual but containing summary data for the population as a group. These four columns give the total counts for the two alleles, the "genotype" for the population, and the maximum quality value, taken over all individuals in the population. If all defined genotypes in the population are 2 (agree with the reference), then the population's genotype is 2, and similarly for 0; otherwise the genotype is 1 (unless all individuals have undefined genotype, in which case it is -1). +*/ + +#include "lib.h" + +// most characters allowed in a row of the table +#define MOST 5000 + +// column for the relevant individuals/groups +int col[MOST]; +int nI; + +int main(int argc, char **argv) { + FILE *fp; + char *p, *z = "\t\n", buf[MOST], trash[MOST]; + int X[MOST], m, i, A, B, G, Q, g; + + if (argc < 3) + fatalf("args: SNP-table individual1 ..."); + + for (i = 2, nI = 0; i < argc; ++i, ++nI) + col[nI] = atoi(argv[i]); + + fp = ckopen(argv[1], "r"); + while (fgets(buf, MOST, fp)) { + if (buf[0] == '#') + continue; + strcpy(trash, buf); + // set X[i] = atoi(i-th word of s), i is base 0 + for (i = 1, p = strtok(trash, z); p != NULL; + ++i, p = strtok(NULL, z)) + X[i] = atoi(p); + for (i = A = B = Q = 0, G = -1; i < nI; ++i) { + m = col[i]; + A += X[m]; + B += X[m+1]; + g = X[m+2]; + if (g != -1) { + if (G == -1) // first time + G = g; + else if (G != g) + G = 1; + } + Q = MAX(Q, X[m+3]); + } + if (i < nI) // check bounds on the population's individuals + continue; + // add columns + if ((p = strchr(buf, '\n')) != NULL) + *p = '\0'; + printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q); + } + + return 0; +} diff -r d6b961721037 -r 95a05c1ef5d5 genome_diversity/src/filter_snps.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genome_diversity/src/filter_snps.c Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,132 @@ +/* pop -- add four columns (allele counts, genotype, maximum quality) for a +* specified population to a Galaxy SNP table, or enforce bounds +* +* argv[1] = file containing a Galaxy table +* argv[2] = lower bound on total coverage (> 0 means interpret as percentage) +* argv[3] = upper bound on total coverae (> 0 means interpret as percentage) +* argv[4] = lower bound on individual coverage +* argv[5] = lower bound on individual quality value +* argv[6] ... are the starting columns (base-1) for the chosen individuals + +What it does on Galaxy +The user specifies that some of the individuals in a gd_snp dataset form a "population", by supplying a list that has been previously created using the Specify Individuals tool. SNPs are then discarded if their total coverage for the population is too low or too high, or if their coverage or quality score for any individual in the population is too low. + +*/ + +#include "lib.h" + +// most characters allowed in a row of the table +#define MOST 5000 + +char buf[MOST]; + +// column for the relevant individuals/groups +int col[MOST], *T; +int nI, lo, hi, X[MOST]; + +void get_X() { + char *p, *z = "\t\n", trash[MOST]; + int i; + + strcpy(trash, buf); + // set X[i] = atoi(i-th word of s), i is base 0 + for (i = 1, p = strtok(trash, z); p != NULL; + ++i, p = strtok(NULL, z)) + X[i] = atoi(p); +} + +int compar(int *a, int *b) { + if (*a < *b) + return -1; + if (*a > *b) + return 1; + return 0; +} + +void find_lo(char *filename) { + FILE *fp = ckopen(filename, "r"); + int n, m, i, k; + + for (n = 0; fgets(buf, MOST, fp); ++n) + ; + T = ckalloc(n*sizeof(int)); + rewind(fp); + for (k = 0; fgets(buf, MOST, fp); ++k) { + get_X(); + for (i = T[k] = 0; i < nI; ++i) { + m = col[i]; + T[k] += (X[m]+X[m+1]); + } + } + qsort((void *)T, (size_t)n, sizeof(int), (const void *)compar); + if (lo < 0) { + lo = -lo; + if (lo > 100) + fatal("cannot have low > 100%"); + lo = T[(n*lo)/100]; + } + if (hi < 0) { + hi = -hi; + if (hi > 100) + fatal("cannot have high > 100%"); + hi = T[(n*hi)/100]; + } + free(T); + fclose(fp); +} + +int main(int argc, char **argv) { + FILE *fp; + char *p; + int m, i, A, B, G, Q, indiv, qual, g, q; + + if (argc < 3) + fatalf("args: SNP-table low high col1 col2 ..."); + + for (i = 6, nI = 0; i < argc; ++i, ++nI) + col[nI] = atoi(argv[i]); + lo = atoi(argv[2]); + hi = atoi(argv[3]); + if (hi == 0) + hi = 100000000; + if (lo < 0 || hi < 0) + find_lo(argv[1]); + indiv = atoi(argv[4]); + qual = atoi(argv[5]); + if (indiv < 0 || qual < 0) + fatalf("percentiles not implemented for individuals"); + + fp = ckopen(argv[1], "r"); + while (fgets(buf, MOST, fp)) { + if (buf[0] == '#') + continue; + get_X(); + for (i = A = B = Q = 0, G = -1; i < nI; ++i) { + m = col[i]; + if (X[m]+X[m+1] < indiv || (q = X[m+3]) < qual) + break; + A += X[m]; + B += X[m+1]; + g = X[m+2]; + if (g != -1) { + if (G == -1) // first time + G = g; + else if (G != g) + G = 1; + } + Q = MAX(Q, q); + } + if (i < nI) // check bounds on the population's individuals + continue; + if (lo == -1 && hi == -1 && indiv == -1 && qual == -1) { + // add columns + if ((p = strchr(buf, '\n')) != NULL) + *p = '\0'; + printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q); + } else if (A+B >= lo && (hi == -1 || A+B <= hi)) + // coverage meets the population-level restrictions + printf("%s", buf); + } + + return 0; +} diff -r d6b961721037 -r 95a05c1ef5d5 genome_diversity/src/get_pi.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genome_diversity/src/get_pi.c Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,282 @@ +/* get_pi -- compute piNon, piSyn, thetaNon and thetaSyn +* +* argv[1] -- SAPs file +* argv[2] -- SNPs file +* argv[3] -- covered intervals file +* argv[4], argv[5], ... -- starting columns in the SNP file for the chosen +* individuals +* +* We assume that lines of argv[1], argv[2] and argv[3] start with a scaffold +* name and a scaffold position, and that they are sorted on those two fields. +* The 4th entry in an interval line gives the reference chromosome. We ignore +* unnumbered chromosome, e.g., "chrX". +* +* Output: +* the number of nonsyn SNPs, total number of nonsynon sites, piNon, +* the number of synon SNPs, total number of synon sites, piSyn, plus +* total length of covered intervals, thetaNon, thetaSyn +* +* What it does on Galaxy +The tool computes values that estimate some basic parameters +*/ + +#include "lib.h" + +// END_FILE should be larger than any real scaffold number +#define END_FILE 1000000000 // scaffold "number" signifying end-of-file +#define BUF_SIZE 10000 // maximum length of a SNP-file row + +int col[10000], nC; // columns containing the chosen genotypes + +// scaffold numbers and positions of the current SAP, SNP, and interval +int nbr_SAP, nbr_SNP, nbr_interv, pos_SAP, pos_SNP, beg, end, columns, debug; + +// current SNP row, the variant amino acids of the SAP, interval's reference chr +char snp[BUF_SIZE], A[100], B[100], chr[100]; + +// number of nonsynon and snynon sites in the current interval +float all_non, all_syn; + +// return the number of chromosome pairs that differ at a SNP +int diff_pair() { + int i, j, X[1000]; + char *p, *z = "\t\n"; + + // set X[i] = atoi(i-th word of SNP-file row), base 1 + for (i = 1, p = strtok(snp, z); p != NULL; + ++i, p = strtok(NULL, z)) + X[i] = atoi(p); + // add genotypes to count the reference allele + for (j = i = 0; i < nC; ++i) + j += X[col[i]]; + // of the 2*nC chromosomes, j have the reference nucleotide + if (debug) + printf("get_pi: j = %d, return %d\n", j, j*(2*nC-j)); + return j*(2*nC-j); +} + +// translate e.g. "scaffold234" to the integer 234 +int name2nbr(char *s) { + if (same_string(s, "chrX")) + return 1000; + if (same_string(s, "chrY")) + return 1001; + while (isalpha(*s)) + ++s; + return atoi(s); +} + +// does one scaffold-position pair strictly precede another +int before(int nbra, int posa, int nbrb, int posb) { + return (nbra < nbrb || (nbra == nbrb && posa < posb)); +} + +// get a SAP; set A and B; set nbr_SAP = END_FILE for end-of-file +void get_SAP(FILE *fp) { + char buf[500], scaf_name[100]; + int old_nbr = nbr_SAP, old_pos = pos_SAP; + + if (nbr_SAP >= END_FILE) + return; + if (!fgets(buf, 500, fp)) { + nbr_SAP = END_FILE; + return; + } + while (buf[0] == '#') + if (!fgets(buf, 500, fp)) { + nbr_SAP = END_FILE; + return; + } + if (columns == 8) { + if (sscanf(buf, "%s %d %*s %*s %*s %*s %s %*s %s", + scaf_name, &pos_SAP, A, B) != 4) + fatalf("bad SAP : %s", buf); + } else if (columns == 5) { + if (sscanf(buf, "%s %d %*s %*s %s %*s %s", + scaf_name, &pos_SAP, A, B) != 4) + fatalf("bad SAP : %s", buf); + } else + fatalf("get_SAP: columns = %d", columns); + nbr_SAP = name2nbr(scaf_name); + if (before(nbr_SAP, pos_SAP, old_nbr, old_pos)) + fatalf("SAP at scaf%d %d is out of order", nbr_SAP, pos_SAP); + if (debug) + printf("SAP: scaf%d %d\n", nbr_SAP, pos_SAP); +} + +// get a SNP +void get_SNP(FILE *fp) { + char scaf_name[100]; + int old_nbr = nbr_SNP, old_pos = pos_SNP; + + if (nbr_SNP >= END_FILE) + return; + if (!fgets(snp, BUF_SIZE, fp)) { + nbr_SNP = END_FILE+1; + return; + } + while (snp[0] == '#') + if (!fgets(snp, 500, fp)) { + nbr_SNP = END_FILE+1; + return; + } + if (sscanf(snp, "%s %d", scaf_name, &pos_SNP) != 2) + fatalf("bad SNP : %s", snp); + nbr_SNP = name2nbr(scaf_name); + if (before(nbr_SNP, pos_SNP, old_nbr, old_pos)) { + fprintf(stderr, "seq%d %d before seq%d %d\n", + nbr_SNP, pos_SNP, old_nbr, old_pos); + fatalf("SNP at sequence %d %d is out of order", nbr_SNP, pos_SNP); + } + if (debug) + printf("SNP: scaf%d %d\n", nbr_SNP, pos_SNP); +} + +// expand fractions .333 and .666 to full double-precision accuracy +double grow(float x) { + int chop = x; + float remain; + double d, third = (double)1/(double)3; + + d = (double)chop; + remain = x - (float)chop; + if (0.1 < remain) + d += third; + if (0.5 < remain) + d += third; + return d; +} + +// read an interval; update tot_non and tot_syn +int get_interval(FILE *fp) { + char buf[500], scaf_name[100], tmp[500], *t, *z = " \t\n"; + int old_nbr = nbr_interv, old_end = end; + + if (!fgets(buf, 500, fp)) + return 0; + while (buf[0] == '#') + if (!fgets(buf, 500, fp)) + return 0; + if (columns == 0) { + strcpy(tmp, buf); + for (columns = 0, t = strtok(tmp, z); t != NULL; + ++columns, t = strtok(NULL, z)) + ; + } + if (columns != 5 && columns != 8) + fatalf("interval table has %d columns", columns); + if (columns == 8 && sscanf(buf, "%s %d %d %s %*s %*s %f %f", + scaf_name, &beg, &end, chr, &all_non, &all_syn) != 6) + fatalf("bad interval : %s", buf); + if (columns == 5) { + if (sscanf(buf, "%s %d %d %f %f", + scaf_name, &beg, &end, &all_non, &all_syn) != 5) + fatalf("bad interval : %s", buf); + strcpy(chr, scaf_name); + } + nbr_interv = name2nbr(scaf_name); + if (before(nbr_interv, beg, old_nbr, old_end)) + fatalf("interval at %s %d is out of order", scaf_name, beg); + if (debug) + printf("int: scaf%d %d-%d\n", nbr_interv, beg, end); + + return 1; +} + +int main(int argc, char **argv) { + FILE *fp1, *fp2, *fp3; + int i, nint, nsap, no_sap, no_snp, no_chr, nsyn, nnon, d, tot_len; + float non, syn, x; + double tot_non = 0.0, tot_syn = 0.0, // total sites in the intervals + factor; + + // process command-line arguments + if (same_string(argv[argc-1], "debug")) { + debug = 1; + --argc; + } + if (argc < 5) + fatal("args: SAPs SNPs intervals individual1 ... [debug]"); + fp1 = ckopen(argv[1], "r"); + fp2 = ckopen(argv[2], "r"); + fp3 = ckopen(argv[3], "r"); + for (i = 4; i < argc; ++i) + col[i-4] = atoi(argv[i]) + 2; + nC = argc - 4; + + // loop over the intervals + tot_len = no_sap = nsap = no_snp = no_chr = nsyn = nnon = 0; + non = syn = 0.0; + for (nint = 0; get_interval(fp3); ++nint) { + if (strncmp(chr, "chr", 3) == 0 && !isdigit(chr[3])) { + ++no_chr; + continue; + } + tot_len += (end - beg); + // expand e.g. .333 to .3333333.. + tot_non += grow(all_non); + tot_syn += grow(all_syn); + + // skip SAPs coming before this interval + while (before(nbr_SAP, pos_SAP, nbr_interv, beg)) + get_SAP(fp1); + // loop over SAPs in this inteval + while (before(nbr_SAP, pos_SAP, nbr_interv, end)) { + ++nsap; + + // look for this SAP in the SNP file + while (before(nbr_SNP, pos_SNP, nbr_SAP, pos_SAP)) { + if (nbr_SNP == nbr_interv && pos_SNP >= beg) + ++no_sap; + get_SNP(fp2); + } + + // is this the SAP we looked for? + if (nbr_SAP == nbr_SNP && pos_SAP == pos_SNP) { + d = diff_pair(); + if (A[0] == B[0]) { + ++nsyn; + syn += (float)d; + } else { + ++nnon; + non += (float)d; + } + get_SNP(fp2); + } else + ++no_snp; + get_SAP(fp1); + } + // process SNPs in the interval but not in the SAP file + while (before(nbr_SNP, pos_SNP, nbr_interv, end)) { + if (nbr_SNP == nbr_interv && pos_SNP >= beg) + ++no_sap; + get_SNP(fp2); + } + } + + // there are x = (2*nC choose 2) pairs of chromosomes + x = (float)(nC*(2*nC-1)); + non /= x; + syn /= x; + printf("%d intervals\n", nint); + if (no_chr) + printf("ignored %d interval%s on unnumbered chromosomes, like chrX\n", + no_chr, no_chr == 1 ? "" : "s"); + printf("%d SNPs, %d nonsyn, %d synon\n", nsap, nnon, nsyn); + if (no_sap) + printf("%d SNPs in an interval are not in the SAP table\n", + no_sap); + if (no_snp) + printf("%d SNPs in an interval are not in the SNP table\n", + no_snp); + printf("nonsynon: %4.3f/%4.3f = %6.5f%%\n", + non, tot_non, 100.0*non/tot_non); + printf("synon: %4.3f/%4.3f = %6.5f%%\n", + syn, tot_syn, 100.0*syn/tot_syn); + for (factor = 0.0, i = 1; i < 2*nC; ++i) + factor += (1.0/(double)i); + factor *= (double)tot_len/100.0; + printf("%d covered bp, thetaNon = %6.5f%%, thetaSyn = %6.5f%%\n", + tot_len, (double)nnon/factor, (double)nsyn/factor); + return 0; +} diff -r d6b961721037 -r 95a05c1ef5d5 genome_diversity/src/pop.c --- a/genome_diversity/src/pop.c Mon Nov 05 12:44:17 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -/* pop -- add four columns (allele counts, genotype, maximum quality) for a -* specified population to a Galaxy SNP table, or enforce bounds -* -* argv[1] = file containing a Galaxy table -* argv[2] = lower bound on total coverage (-1 = no lower bound) -* argv[3] = upper bound on total coverae (-1 if no bound) -* argv[4] = lower bound on individual coverage (-1 = no bound) -* argv[5] = lower bound on individual quality value (-1 = no bound) -* argv[6] ... are the starting columns (base-1) for the chosen individuals - -What it does on Galaxy -The user specifies that some of the individuals in the selected SNP table are form a "population" that has been previously defined using the Galaxy tool to select individuals from a SNP table. One option is for the program to append four columns to the table, giving the total counts for the two alleles, the "genotype" for the population and the maximum quality value, taken over all indivuals in the population. If all defined genotypes in the population are 2 (agree with the reference), the population's genotype is 2; similarly for 0; otherwise the genoype is 1 (unless all individuals have undefined genotype, in which case it is -1. The other option is to remove rows from the table for which the total coverage for the population is either too low or too high, and/or if the individual coverage or quality value is too low. -*/ - -#include "lib.h" - -// most characters allowed in a row of the table -#define MOST 50000 - -// column for the relevant individuals/groups -int col[MOST]; -int nI; - -int main(int argc, char **argv) { - FILE *fp; - char *p, *z = "\t\n", buf[MOST], trash[MOST]; - int X[MOST], m, i, A, B, G, Q, lo, hi, indiv, qual, g, q; - - if (argc < 3) - fatalf("args: SNP-table low high col1 col2 ..."); - - lo = atoi(argv[2]); - hi = atoi(argv[3]); - indiv = atoi(argv[4]); - qual = atoi(argv[5]); - for (i = 6, nI = 0; i < argc; ++i, ++nI) - col[nI] = atoi(argv[i]); - - fp = ckopen(argv[1], "r"); - while (fgets(buf, MOST, fp)) { - if (buf[0] == '#') - continue; - strcpy(trash, buf); - // set X[i] = atoi(i-th word of s), i is base 0 - for (i = 1, p = strtok(trash, z); p != NULL; - ++i, p = strtok(NULL, z)) - X[i] = atoi(p); - for (i = A = B = Q = 0, G = -1; i < nI; ++i) { - m = col[i]; - if (X[m]+X[m+1] < indiv || (q = X[m+3]) < qual) - break; - A += X[m]; - B += X[m+1]; - g = X[m+2]; - if (g != -1) { - if (G == -1) // first time - G = g; - else if (G != g) - G = 1; - } - Q = MAX(Q, q); - } - if (i < nI) // check bounds on the population's individuals - continue; - if (lo == -1 && hi == -1 && indiv == -1 && qual == -1) { - // add columns - if ((p = strchr(buf, '\n')) != NULL) - *p = '\0'; - printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q); - } else if (A+B >= lo && (hi == -1 || A+B <= hi)) - // coverage meets the population-level restrictions - printf("%s", buf); - } - - return 0; -} diff -r d6b961721037 -r 95a05c1ef5d5 genome_diversity/src/sweep.c --- a/genome_diversity/src/sweep.c Mon Nov 05 12:44:17 2012 -0500 +++ b/genome_diversity/src/sweep.c Mon Mar 11 11:28:06 2013 -0400 @@ -1,8 +1,8 @@ /* sweep -- find regions of the genome with high scores (e.g., Fst scores). * * argv[1] -- file containing a Galaxy table -* argv[2] -- column number (base-1) for the chromosome name -* argv[3] -- column number for the (base-0) chromosomal position +* argv[2] -- column number for the chromosome name (column numbers base-1) +* argv[3] -- column number for the chromosomal position * argv[4] -- column number for a score for the position * argv[5] -- a percentage, such as "95", or a raw score, such as "=0.9". * argv[6] -- the number of randomizations (shuffles) of the scores @@ -112,6 +112,8 @@ fatal("cannot happen"); strcpy(chr, get_col(buf, chr_col)); S[0].pos = atoi(get_col(buf, pos_col)); + if (S[0].pos < 0) + fatalf("remove unmapped SNPs (address = -1)"); S[0].x = atof(get_col(buf, score_col)); for (nS = 1; ; ++nS) { if (!fgets(buf, BUF_SIZE, fp)) { @@ -121,6 +123,8 @@ if (!same_string(chr, get_col(buf, chr_col))) break; S[nS].pos = atoi(get_col(buf, pos_col)); + if (S[nS].pos < 0) + fatalf("remove unmapped SNPs (address = -1)"); S[nS].x = atof(get_col(buf, score_col)); } return 1; @@ -183,7 +187,7 @@ struct snp *s; if (argc != 7 && argc != 8) - fatal("args: table chr_col pos_col score_col threhold randomizations [SNPs]"); + fatal("args: table chr_col pos_col score_col threshold randomizations [SNPs]"); // process command-line arguments chr_col = atoi(argv[2]); diff -r d6b961721037 -r 95a05c1ef5d5 modify_snp_table.py --- a/modify_snp_table.py Mon Nov 05 12:44:17 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -#!/usr/bin/env python - -import sys -import subprocess -from Population import Population - -################################################################################ - -if len(sys.argv) < 9: - print >> sys.stderr, "Usage" - sys.exit(1) - -input, p1_input, output, lo, hi, lo_ind, lo_ind_qual = sys.argv[1:8] -individual_metadata = sys.argv[8:] - -p_total = Population() -p_total.from_tag_list(individual_metadata) - -p1 = Population() -p1.from_population_file(p1_input) - -if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' - sys.exit(1) - -################################################################################ - -prog = 'pop' - -args = [] -args.append(prog) -args.append(input) -args.append(lo) -args.append(hi) -args.append(lo_ind) -args.append(lo_ind_qual) - -columns = p1.column_list() - -for column in sorted(columns): - args.append(column) - -fh = open(output, 'w') - -#print "args:", ' '.join(args) -p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr) -rc = p.wait() -fh.close() - -sys.exit(0) - diff -r d6b961721037 -r 95a05c1ef5d5 nucleotide_diversity_pi.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nucleotide_diversity_pi.py Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +import sys +import subprocess +from Population import Population + +################################################################################ + +def run_program(prog, args, stdout_file=None, space_to_tab=False): + #print "args: ", ' '.join(args) + p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = p.communicate() + rc = p.returncode + + if stdout_file is not None: + with open(stdout_file, 'w') as ofh: + lines = stdoutdata.split('\n') + for line in lines: + line = line.strip() + if line: + if space_to_tab: + line = line.replace(' ', '\t') + print >> ofh, line + + if rc != 0: + print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) + print >> sys.stderr, stderrdata + sys.exit(1) + +################################################################################ + +if len(sys.argv) < 8: + print >> sys.stderr, "Usage" + sys.exit(1) + +gd_saps_file, gd_snps_file, covered_intervals_file, gd_indivs_file, output_file = sys.argv[1:6] +individual_metadata = sys.argv[6:] + +p_total = Population() +p_total.from_tag_list(individual_metadata) + +p1 = Population() +p1.from_population_file(gd_indivs_file) +if not p_total.is_superset(p1): + print >> sys.stderr, 'There is an individual in the population individuals that is not in the SNP table' + sys.exit(1) + +################################################################################ + +prog = 'get_pi' + +args = [ prog ] +args.append(gd_saps_file) +args.append(gd_snps_file) +args.append(covered_intervals_file) + +columns = p1.column_list() +for column in columns: + args.append('{0}'.format(column)) + +run_program(None, args, stdout_file=output_file) + +sys.exit(0) + diff -r d6b961721037 -r 95a05c1ef5d5 nucleotide_diversity_pi.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nucleotide_diversity_pi.xml Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,34 @@ + + : &pi; and &theta; + + + nucleotide_diversity_pi.py "$saps" "$snps" "$intervals" "$indivs" "$output" + #for $individual_name, $individual_col in zip($snps.dataset.metadata.individual_names, $snps.dataset.metadata.individual_columns) + #set $arg = '%s:%s' % ($individual_col, $individual_name) + "$arg" + #end for + + + + + + + + + + + + + + +**What it does** + +This tool computes values that estimate some basic parameters. + +**Output** + +the number of nonsyn SNPs, total number of nonsynon sites, piNon, +the number of synon SNPs, total number of synon sites, piSyn, plus +total length of covered intervals, thetaNon, thetaSyn + + diff -r d6b961721037 -r 95a05c1ef5d5 pca.xml --- a/pca.xml Mon Nov 05 12:44:17 2012 -0500 +++ b/pca.xml Mon Mar 11 11:28:06 2013 -0400 @@ -1,5 +1,5 @@ - : Principal Component Analysis of genotype data + : Principal Components Analysis of genotype data pca.py "$input" "$input.extra_files_path" "$output" "$output.files_path" @@ -56,7 +56,7 @@ The user selects a gd_ped dataset generated by the Prepare Input tool. The PCA tool runs a -Principal Component Analysis on the input genotype data and constructs +Principal Components Analysis on the input genotype data and constructs a plot of the top two principal components. It also reports the following estimates of the statistical significance of the analysis. diff -r d6b961721037 -r 95a05c1ef5d5 rank_pathways.xml --- a/rank_pathways.xml Mon Nov 05 12:44:17 2012 -0500 +++ b/rank_pathways.xml Mon Mar 11 11:28:06 2013 -0400 @@ -1,9 +1,9 @@ - - : Assess the impact of gene sets on pathways + + : Assess the impact of a gene set on KEGG pathways #if str($output_format) == 'a' - calctfreq.py + rank_pathways_pct.py #else if str($output_format) == 'b' calclenchange.py #end if @@ -71,7 +71,8 @@ 1. number of genes in the pathway present in the input dataset 2. percentage of the total genes in the pathway included in the input dataset 3. rank of the frequency (from high freq to low freq) -4. name of the pathway +4. Fisher probability of enrichment/depletion of pathway genes in the input dataset +5. name of the pathway If pathways are ranked by change in length and number of paths, the output is a tabular dataset with the following columns: diff -r d6b961721037 -r 95a05c1ef5d5 rank_pathways_pct.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rank_pathways_pct.py Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,138 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# calcfreq.py +# +# Copyright 2011 Oscar Bedoya-Reina +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +import argparse,os,sys +from decimal import Decimal,getcontext +from LocationFile import LocationFile +from fisher import pvalue + +#method to rank the the pthways by mut. freq. +def rankd(ltfreqs): + ordvals=sorted(ltfreqs)#sort and reverse freqs. + #~ + outrnk=[] + tmpFreq0,tmpCount,pval,tmpPthw=ordvals.pop()#the highest possible value + crank=1 + outrnk.append('\t'.join([str(tmpCount),str(tmpFreq0),str(crank),str(pval),tmpPthw])) + totalnvals=len(ordvals) + cnt=0 + while totalnvals>cnt: + cnt+=1 + tmpFreq,tmpCount,pval,tmpPthw=ordvals.pop() + if tmpFreq!=tmpFreq0: + crank=len(outrnk)+1 + tmpFreq0=tmpFreq + outrnk.append('\t'.join([str(tmpCount),str(tmpFreq),str(crank),str(pval),tmpPthw])) + return outrnk + + +def main(): + parser = argparse.ArgumentParser(description='Obtain KEGG images from a list of genes.') + parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format') + parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the count of genes in the list, Column 2 is the percentage of the pathway genes present on the list. Column 3 is the rank based on column 2') + parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name') + parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code') + parser.add_argument('--loc_file',metavar='location file',type=str,help='location file') + parser.add_argument('--species',metavar='species',type=str,help='species') + #~Open arguments + class C(object): + pass + fulargs=C() + parser.parse_args(sys.argv[1:],namespace=fulargs) + #test input vars + inputf,outputf,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn + locf,species=fulargs.loc_file,fulargs.species + #make a dictionary of valid genes + posKEGGclmn-=1 + Kgeneposcolmn-=1 + dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set(x.split('\t')[posKEGGclmn].split('.'))) for x in open(inputf).read().splitlines()[1:] if x.split('\t')[posKEGGclmn] not in set(['U','N'])]) + for u in ['U','N']: + try: + a=dKEGGcPthws.pop(u) + except: + pass + getcontext().prec=2#set 2 decimal places + sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1]) + while True:#to correct names with more than one gene + try: + mgenes=sdGenes.pop() + pthwsAssotd=dKEGGcPthws.pop(mgenes) + mgenes=mgenes.split('.') + for eachg in mgenes: + dKEGGcPthws[eachg]=pthwsAssotd + except: + break + #~ Count genes + + location_file = LocationFile(locf) + prefix, kxml_dir_path, dict_file = location_file.get_values(species) + dPthContsTotls = {} + try: + with open(dict_file) as fh: + for line in fh: + line = line.rstrip('\r\n') + value, key = line.split('\t') + dPthContsTotls[key] = int(value) + except IOError, err: + print >> sys.stderr, 'Error opening dict file {0}: {1}'.format(dict_file, err.strerror) + sys.exit(1) + + dPthContsTmp=dict([(x,0) for x in dPthContsTotls.keys()])#create a list of genes + sdGenes=set(dKEGGcPthws.keys())#list of all genes + cntGens=0 + ltGens=len(sdGenes) + while cntGens> sys.stderr, "Error: pathway not found in database: '{0}'".format(eachP) + sys.exit(1) + cntGens+=1 + #~ Calculate Freqs. + ltfreqs=[] + cntAllKEGGinGnm=sum(dPthContsTotls.values()) + cntListKEGGinGnm=sum(dPthContsTmp.values()) + cntAllKEGGNOTinGnm=cntAllKEGGinGnm-cntListKEGGinGnm + for pthw in dPthContsTotls: + cntAllKEGGinpthw=Decimal(dPthContsTotls[pthw]) + try: + cntListKEGGinpthw=Decimal(dPthContsTmp[pthw]) + except: + cntListKEGGinpthw=Decimal('0') + cntAllKEGGNOTinpthw=cntAllKEGGinpthw-cntListKEGGinpthw + pval=pvalue(cntListKEGGinpthw,cntAllKEGGNOTinpthw,cntListKEGGinGnm,cntAllKEGGNOTinGnm) + + ltfreqs.append([(cntListKEGGinpthw/cntAllKEGGinpthw),cntListKEGGinpthw,Decimal(str(pval.two_tail))*1,pthw]) + #~ ltfreqs=[((Decimal(dPthContsTmp[x])/Decimal(dPthContsTotls[x])),Decimal(dPthContsTmp[x]),x) for x in dPthContsTotls] + tabllfreqs='\n'.join(rankd(ltfreqs)) + salef=open(outputf,'w') + salef.write(tabllfreqs) + salef.close() + return 0 + + +if __name__ == '__main__': + main() diff -r d6b961721037 -r 95a05c1ef5d5 rank_terms.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rank_terms.py Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,132 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GOFisher.py +# +# Copyright 2013 Oscar Reina +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +import argparse +import os +import sys +from fisher import pvalue +from decimal import Decimal,getcontext + +def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): + """ + """ + dGOTENSEMBLT={} + for eachl in open(inExtnddfile,'r'): + if eachl.strip(): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] + GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) + GOTs=GOTs.difference(set(['','U','N'])) + for GOT in GOTs: + try: + dGOTENSEMBLT[GOT].add(ENSEMBLT) + except: + dGOTENSEMBLT[GOT]=set([ENSEMBLT]) + #~ + ##dGOTENSEMBLT.pop('') + ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) + return dGOTENSEMBLT,ENSEMBLTGinGO + +def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): + """ + returns a set of the ENSEMBLT codes present in the input list and + in the GO file + """ + sENSEMBLTSAPsinGO=set() + for eachl in open(inSAPsfile,'r'): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] + if ENSEMBLT in ENSEMBLTGinGO: + sENSEMBLTSAPsinGO.add(ENSEMBLT) + return sENSEMBLTSAPsinGO + +def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): + """ + returns a list of the ENSEMBLT codes present in the input list and + in the GO file. The terms in this list are: 'Go Term','# Genes in + the GO Term','# Genes in the list and in the GO Term','Enrichement + of the GO Term for genes in the input list','Genes in the input list + present in the GO term' + """ + getcontext().prec=2#set 2 decimal places + SAPs_all=len(sENSEMBLTSAPsinGO) + NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all + #~ + lp=len(dGOTENSEMBLT) + cnt=0 + #~ + ltfreqs=[] + for echGOT in dGOTENSEMBLT: + cnt+=1 + ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) + CntGO_All=len(dGOTENSEMBLT[echGOT]) + SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) + NoSAPs_GO=CntGO_All-SAPs_GO + pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) + #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) + ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) + #~ + ltfreqs.sort() + ltfreqs.reverse() + outl=[] + cper,crank=Decimal('2'),0 + #~ + for perc,cnt_go,pval,goTerm in ltfreqs: + if perc + : Assess the enrichment/depletion of a gene set for GO terms + + + #set $t_col1_0 = int(str($t_col1)) - 1 + #set $t_col2_0 = int(str($t_col2)) - 1 + #set $g_col2_0 = int(str($g_col2)) - 1 + rank_terms.py --input "$input1" --columnENSEMBLT $t_col1_0 --inExtnddfile "$input2" --columnENSEMBLTExtndd $t_col2_0 --columnGOExtndd $g_col2_0 --output "$output" + + + + + + + + + + + + + + + + + +**Dataset formats** + +All of the input and output datasets are in tabular_ format. +The query dataset has a column containing ENSEMBL transcript codes for +the gene set of interest, while the background dataset has one column +with ENSEMBL transcript codes and another with GO terms, for some +larger universe of genes. +The output dataset is described below. +(`Dataset missing?`_) + +.. _tabular: ./static/formatHelp.html#tab +.. _Dataset missing?: ./static/formatHelp.html + +----- + +**What it does** + +Given a query set of genes from a larger background dataset, this tool +evaluates the statistical over- or under-representation of Gene Ontology +terms in the query set, using a two-tailed Fisher's exact test. + +The output contains a row for each GO term, with the following columns: + +1. count: the number of genes in the query set that are in this GO category +2. representation: the percentage of this category's genes (from the background dataset) that appear in the query set +3. ranking of this term, based on its representation ("1" is highest) +4. Fisher probability of enrichment/depletion of this GO category in the query dataset +5. GO term + + + diff -r d6b961721037 -r 95a05c1ef5d5 restore_attributes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/restore_attributes.xml Mon Mar 11 11:28:06 2013 -0400 @@ -0,0 +1,61 @@ + + : Fill in missing properties for a gd_snp dataset + + + cp.py "$dst" "$output" + + + + + + + + + + + + + +**Dataset formats** + +All of the input and output datasets are in gd_snp_ format. (`Dataset missing?`_) + +.. _gd_snp: ./static/formatHelp.html#gd_snp +.. _Dataset missing?: ./static/formatHelp.html + +----- + +**What it does** + +This tool copies metadata information from one SNP dataset to another, leaving +the actual SNP data itself unchanged. Datasets in gd_snp format have a number +of "extra" properties associated with them, such as the focus species (which +may be different from the reference assembly), names of individuals, column +numbers containing certain data fields, etc. These values are stored in the +dataset's metadata, in addition to the more usual attributes like dataset name, +assembly build, and so forth. You can see some of these by clicking on the +pencil icon for the dataset. + +The Genome Diversity tools need this information to perform their tasks. +However, these additional attributes may be lost if the datatype is changed. +For example, suppose you want to see which SNPs overlap some other dataset in +your history, like coding regions or TAL1 binding sites. The Intersect tool +only works on datasets that are in interval format, so you might use the Compute +tool to append a new column with the End position of the SNP (= Start + 1), +then use the pencil icon to change the datatype to "interval". This works +great for doing the intersection, but if you then want to run one of the Genome +Diversity tools on the resulting SNPs, there's a problem: you can change the +datatype back to gd_snp easily enough, but the extra attributes have been lost +in the conversion to interval. + +As long as the proper values of the lost attributes have not changed, then this +tool can restore them by copying from the old gd_snp dataset in your history. +In the above example, appending a column does not change the numbering of the +earlier columns, and deleting rows via Intersect does not affect the extra +attributes either. Note that all of the metadata is copied, not just the extra +attributes specific to gd_snp (though standard items like the assembly build, +the number of lines, and the name for the output dataset are updated +automatically by the Galaxy framework). + + + diff -r d6b961721037 -r 95a05c1ef5d5 static/images/gd_pathway_image.png Binary file static/images/gd_pathway_image.png has changed