# HG changeset patch # User Richard Burhans # Date 1351010512 14400 # Node ID f04f40a36cc8ab619c6bbab4554a4338653fa236 # Parent a3af29edcce203c88c4e2d777f3477224134711d Latest changes from Belinda and Cathy. Webb's updates to the Fst tools. diff -r a3af29edcce2 -r f04f40a36cc8 add_fst_column.xml --- a/add_fst_column.xml Fri Sep 28 11:57:18 2012 -0400 +++ b/add_fst_column.xml Tue Oct 23 12:41:52 2012 -0400 @@ -1,4 +1,4 @@ - + : Compute a fixation index score for each SNP @@ -34,7 +34,8 @@ - + + @@ -62,30 +63,33 @@ **What it does** -The user specifies a SNP table and two "populations" of individuals, -both previously defined using the Specify Individuals tool. -No individual can be in both populations. Other choices are as follows. +The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to specify individuals from a SNP table. No individual can be in both populations. Other choices are as follows. -Data source. The allele frequencies of a SNP in the two populations can be -estimated either by the total number of reads of each allele, or by adding -the frequencies inferred from genotypes of individuals in the populations. +Data source. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations. + +After specifying the data source, the user sets lower bounds on amount of data required at a SNP. For estimating the Fst using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual. -After specifying the data source, the user sets lower bounds on amount -of data required at a SNP. For estimating the Fst using read counts, -the bound is the minimum count of reads of the two alleles in a population. -For estimations based on genotype, the bound is the minimum reported genotype -quality per individual. +The user specifies whether the SNPs that violate the lower bound should be ignored or the Fst set to -1. -The user specifies whether the SNPs that violate the lower bound should be -ignored or the Fst set to -1. +The user specifies whether SNPs where both populations appear to be fixed for the same allele should be retained or discarded. -The user specifies whether SNPs where both populations appear to be fixed -for the same allele should be retained or discarded. - -Finally, the user chooses which definition of Fst to use: Wright's original -definition or Weir's unbiased estimator. +Finally, the user chooses which definition of Fst to use: Wright's original definition, the Weir-Cockerham unbiased estimator, or the Reich-Patterson estimator. A column is appended to the SNP table giving the Fst for each retained SNP. +References: + +Sewall Wright (1951) The genetical structure of populations. Ann Eugen 15:323-354. + +B. S. Weir and C. Clark Cockerham (1984) Estimating F-statistics for the analysis of population structure. Evolution 38:1358-1370. + +Weir, B.S. 1996. Population substructure. Genetic data analysis II, pp. 161-173. Sinauer Associates, Sundand, MA. + +David Reich, Kumarasamy Thangaraj, Nick Patterson, Alkes L. Price, and Lalji Singh (2009) Reconstructing Indian population history. Nature 461:489-494, especially Supplement 2. + +Their effectiveness for computing FSTs when there are many SNPs but few individuals is discussed in the followoing paper. + +Eva-Maria Willing, Christine Dreyer, Cock van Oosterhout (2012) Estimates of genetic differentiation measured by FST do not necessarily require large sample sizes when using many SNP markers. PLoS One 7:e42649. + diff -r a3af29edcce2 -r f04f40a36cc8 aggregate_gd_indivs.xml --- a/aggregate_gd_indivs.xml Fri Sep 28 11:57:18 2012 -0400 +++ b/aggregate_gd_indivs.xml Tue Oct 23 12:41:52 2012 -0400 @@ -22,11 +22,6 @@ - - - - - diff -r a3af29edcce2 -r f04f40a36cc8 average_fst.py --- a/average_fst.py Fri Sep 28 11:57:18 2012 -0400 +++ b/average_fst.py Tue Oct 23 12:41:52 2012 -0400 @@ -6,12 +6,12 @@ ################################################################################ -if len(sys.argv) < 12: +if len(sys.argv) < 11: print >> sys.stderr, "Usage" sys.exit(1) -input, p1_input, p2_input, data_source, min_total_count, discard_fixed, biased, output, shuffles, p0_input = sys.argv[1:11] -individual_metadata = sys.argv[11:] +input, p1_input, p2_input, data_source, min_total_count, discard_fixed, output, shuffles, p0_input = sys.argv[1:10] +individual_metadata = sys.argv[10:] try: shuffle_count = int(shuffles) @@ -51,7 +51,6 @@ args.append(data_source) args.append(min_total_count) args.append(discard_fixed) -args.append(biased) args.append(shuffles) columns = p1.column_list() diff -r a3af29edcce2 -r f04f40a36cc8 average_fst.xml --- a/average_fst.xml Fri Sep 28 11:57:18 2012 -0400 +++ b/average_fst.xml Tue Oct 23 12:41:52 2012 -0400 @@ -1,8 +1,8 @@ - + : Estimate the relative fixation index between two populations - average_fst.py "$input" "$p1_input" "$p2_input" "$data_source.ds_choice" "$data_source.min_value" "$discard_fixed" "$biased" "$output" + average_fst.py "$input" "$p1_input" "$p2_input" "$data_source.ds_choice" "$data_source.min_value" "$discard_fixed" "$output" #if $use_randomization.ur_choice == '1' "$use_randomization.shuffles" "$use_randomization.p0_input" #else @@ -37,11 +37,6 @@ - - - - - @@ -67,7 +62,6 @@ - @@ -77,37 +71,36 @@ **What it does** -The user specifies a SNP table and two "populations" of individuals, -both previously defined using the Specify Individuals tool. -No individual can be in both populations. Other choices are as follows. +The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to specify individuals from a SNP table. No individual can be in both populations. Other choices are as follows. + +Data source. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations. + +After specifying the data source, the user sets lower bounds on amount of data required at a SNP. For estimating the FST using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual. SMPs not meeting these lower bounds are ignored. + +The user specifies whether SNPs where both populations appear to be fixed for the same allele should be retained or discarded. -Data source. The allele frequencies of a SNP in the two populations can be -estimated either by the total number of reads of each allele, or by adding -the frequencies inferred from genotypes of individuals in the populations. +Finally, the user decides whether to use randomizations. If so, then the user specifies how many randomly generated population pairs (retaining the numbers of individuals of the originals) to generate, as well as the "population" of additional individuals (not in the first two populations) that can be used in the randomization process. -After specifying the data source, the user sets lower bounds on amount -of data required at a SNP. For estimating the Fst using read counts, -the bound is the minimum count of reads of the two alleles in a population. -For estimations based on genotype, the bound is the minimum reported genotype -quality per individual. SNPs not meeting these lower bounds are ignored. +The program prints the following measures of FST for the two populations. +1. The formulation by Sewall Wright (average over FSTs for all SNPs). +2. The Weir-Cockerham estimator (average over FSTs for all SNPs). +3. The Reich-Patterson estimator (average over FSTs for all SNPs). +4. The population-based Reich-Patterson estimator. -The user specifies whether SNPs where both populations appear to be fixed -for the same allele should be retained or discarded. +If randomizations were requested, it prints a summary for each of the four definitions of FST that includes the maximum and average value, and the highest-scoring population pair (if any scored higher than the two user-specified populations). + +References: -The user chooses which definition of Fst to use: Wright's original definition -or Weir's unbiased estimator. +Sewall Wright (1951) The genetical structure of populations. Ann Eugen 15:323-354. + +B. S. Weir and C. Clark Cockerham (1984) Estimating F-statistics for the analysis of population structure. Evolution 38:1358-1370. -Finally, the user decides whether to use randomizations. If so, then the -user specifies how many randomly generated population pairs (retaining -the numbers of individuals of the originals) to generate, as well as the -"population" of additional individuals (not in the first two populations) -that can be used in the randomization process. +Weir, B.S. 1996. Population substructure. Genetic data analysis II, pp. 161-173. Sinauer Associates, Sundand, MA. + +David Reich, Kumarasamy Thangaraj, Nick Patterson, Alkes L. Price, and Lalji Singh (2009) Reconstructing Indian population history. Nature 461:489-494, especially Supplement 2. -The program prints the average Fst for the original populations and the -number of SNPs used to compute it. If randomizations were requested, -it prints the average Fst for each randomly generated population pair, -ending with a summary that includes the maximum and average value, and the -highest-scoring population pair. +Their effectiveness for computing FSTs when there are many SNPs but few individuals is discussed in the followoing paper. +Eva-Maria Willing, Christine Dreyer, Cock van Oosterhout (2012) Estimates of genetic differentiation measured by FST do not necessarily require large sample sizes when using many SNP markers. PLoS One 7:e42649. diff -r a3af29edcce2 -r f04f40a36cc8 commits.log --- a/commits.log Fri Sep 28 11:57:18 2012 -0400 +++ b/commits.log Tue Oct 23 12:41:52 2012 -0400 @@ -1,3 +1,8 @@ + +:8703e16fca01 +cathy 2012-10-04 11:42 +More changes by Cathy, mostly in Phylogenetic Tree and Prepare Input, +including some UI adjustments. :7b775e5b68b4 cathy 2012-09-28 00:55 diff -r a3af29edcce2 -r f04f40a36cc8 coverage_distributions.xml --- a/coverage_distributions.xml Fri Sep 28 11:57:18 2012 -0400 +++ b/coverage_distributions.xml Tue Oct 23 12:41:52 2012 -0400 @@ -87,7 +87,7 @@ ----- -**Examples** +**Example** - input:: diff -r a3af29edcce2 -r f04f40a36cc8 filter_gd_snp.xml --- a/filter_gd_snp.xml Fri Sep 28 11:57:18 2012 -0400 +++ b/filter_gd_snp.xml Tue Oct 23 12:41:52 2012 -0400 @@ -26,7 +26,6 @@ - diff -r a3af29edcce2 -r f04f40a36cc8 genome_diversity/src/Fst_ave.c --- a/genome_diversity/src/Fst_ave.c Fri Sep 28 11:57:18 2012 -0400 +++ b/genome_diversity/src/Fst_ave.c Tue Oct 23 12:41:52 2012 -0400 @@ -1,50 +1,69 @@ -/* Fst_ave -- determine the average Fst values between two specified populations -* and between two random populations +/* Fst_ave -- determine four FST values between two specified populations, +* and optionally between several pairs of random populations * * argv{1] = a Galaxy SNP table. For each of several individuals, the table * has four columns (#A, #B, genotype, quality). -* argv[2] = 1 if Fst is estimated from SAMtools genotypes; 0 means use +* argv[2] = 1 if FST is estimated from SAMtools genotypes; 0 means use * read-coverage data. -* argv[3] = lower bound, for individual quality value if argv[2] = 1 +* argv[3] = lower bound, for individual quality value if argv[2] = 1, * or for total number of reads per population if argv[2] = 0. * SNPs not satisfying these lower bounds are ignored. * argv[4] = 1 to discard SNPs that appear fixed in the two populations -* argv[5] = 1 for unbiased estimator, else 0 for the original Wright form. -* argv[6] = k => 0 says report the average Fst and the largest average over k -* randomly chosen splits into two populations of those sizes -* argv[7], argv[8], ..., have the form "13:1", "13:2" or "13:0", meaning +* argv[5] = k says report the maximum and average FST over k randomly +* chosen splits into two populations of two original sizes +* argv[6], argv[7], ..., have the form "13:1", "13:2" or "13:0", meaning * that the 13th and 14th columns (base 1) give the allele counts -* for an individual that is in population 1, in population 2, -* or in neither population. +* (and column 15 gives the genotype) for an individual that is in +* population 1, in population 2, or in neither population. What it does on Galaxy -The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to select individuals from a SNP table. No individual can be in both populations. Other choices are as follows. +The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to specify individuals from a SNP table. No individual can be in both populations. Other choices are as follows. -Data soure. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations. +Data source. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations. -After specifying the data source, the user sets lower bounds on amount of data required at a SNP. For estimating the Fst using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual. SMPs not meeting these lower bounds are ignored. +After specifying the data source, the user sets lower bounds on amount of data required at a SNP. For estimating the FST using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual. SMPs not meeting these lower bounds are ignored. The user specifies whether SNPs where both populations appear to be fixed for the same allele should be retained or discarded. -The user chooses which definition of Fst to use: Wright's original definition or Weir's unbiased estimator. +Finally, the user decides whether to use randomizations. If so, then the user specifies how many randomly generated population pairs (retaining the numbers of individuals of the originals) to generate, as well as the "population" of additional individuals (not in the first two populations) that can be used in the randomization process. + +The program prints the following measures of FST for the two populations. +1. The formulation by Sewall Wright (average over FSTs for all SNPs). +2. The Weir-Cockerham estimator (average over FSTs for all SNPs). +3. The Reich-Patterson estimator (average over FSTs for all SNPs). +4. The population-based Reich-Patterson estimator. + +If randomizations were requested, it prints a summary for each of the four definitions of FST that includes the maximum and average value, and the highest-scoring population pair (if any scored higher than the two user-specified populations). + +References: -Finally, the user decides whether to use randomizations. If so, then the user specifies how many randomly generated population pairs (retaining the numbers of individuals of the originals) to generate, as well as the "population" of additional individuals (not in the first two popuations) that can be used in the ransmization process. +Sewall Wright (1951) The genetical structure of populations. Ann Eugen 15:323-354. + +B. S. Weir and C. Clark Cockerham (1984) Estimating F-statistics for the analysis of population structure. Evolution 38:1358–1370. + +Weir, B.S. 1996. Population substructure. Genetic data analysis II, pp. 161–173. Sinauer Associates, Sunderland, MA. -The program prints the average Fst for the original populations and the number of SNPs used to compute it. If randomizations were requested, it prints the average Fst for each randomly generated population pair, ending with a summary that includes the maximum and average value, and the highest-scoring population pair. +David Reich, Kumarasamy Thangaraj, Nick Patterson, Alkes L. Price, and Lalji Singh (2009) Reconstructing Indian population history. Nature 461:489-494, especially Supplement 2. + +Their effectiveness for computing FSTs when there are many SNPs but few individuals is discussed in the followoing paper. + +Eva-Maria Willing, Christine Dreyer, Cock van Oosterhout (2012) Estimates of genetic differentiation measured by FST do not necessarily require large sample sizes when using many SNP markers. PLoS One 7:e42649. + */ #include "lib.h" #include "Fst_lib.h" -// maximum legth of a line from the table +// maximum length of a line from the table #define MOST 5000 // information about the specified individuals // x is an array of nI values 0, 1, or 2; // shuffling x creates random "populations" -int col[MOST], x[MOST], best_x[MOST]; -int nI, lower_bound, unbiased, discard, genotypes, nsnp; +int col[MOST], x[MOST]; +int nI, lower_bound, discard, genotypes, nsnp, nfail; +double F_wright, F_weir, F_reich, N_reich, D_reich; // each SNP has an array of counts struct count { @@ -57,16 +76,20 @@ struct snp *next; } *start, *last; -// given the two populations specified by x[], return the average Fst -double ave_Fst() { - double tot_Fst; +/* For each of wright, weir and reich, we observe allele counts A1 and A2 +* for one allele in the two populations, and B1 and B2 for the other allele. +*/ + +// given the two populations specified by x[], compute four corresponding FSTs +void pop_Fst() { + double N, D; struct snp *s; int i, A1, B1, A2, B2, too_few; // scan the SNPs - tot_Fst = 0.0; - nsnp = 0; + F_wright = F_weir = F_reich = N_reich = D_reich = 0.0; + nsnp = nfail = 0; for (s = start; s != NULL; s = s->next) { // get counts for the two populations at this SNP for (A1 = B1 = A2 = B2 = i = 0; i < nI; ++i) { @@ -85,10 +108,30 @@ too_few = (genotypes ? 1 : lower_bound); if (A1+B1 >= too_few && A2+B2 >= too_few) { ++nsnp; - tot_Fst += Fst(A1, B1, A2, B2, unbiased); + wright(A1, A2, B1, B2, &N, &D); + if (D != 0.0) + F_wright += N/D; + else + ++nfail; + weir(A1, A2, B1, B2, &N, &D); + if (D != 0.0) + F_weir += N/D; + else + ++nfail; + reich(A1, A2, B1, B2, &N, &D); + N_reich += N; + D_reich += D; + if (D != 0.0) + F_reich += N/D; + else + ++nfail; } } - return tot_Fst/nsnp; + F_wright /= nsnp; + F_weir /= nsnp; + N_reich /= nsnp; + D_reich /= nsnp; + F_reich /= nsnp; } /* shuffle the values x[0], x[1], ... , x[nI-1]; @@ -110,12 +153,14 @@ int main(int argc, char **argv) { FILE *fp; char *p, *z = "\t\n", buf[MOST]; - int X[MOST], nshuff, n, i, j, k, saw[3], larger, all = 1; + int X[MOST], nshuff, n, i, j, k, saw[3], larger0, larger1, larger2, + larger3, best_x0[MOST], best_x1[MOST], best_x2[MOST], best_x3[MOST]; struct snp *new; - double F, F1, largest_F, tot_F; + double F, F0, F1, F2, F3, tot_F0, tot_F1, tot_F2, tot_F3, + largest_F0, largest_F1, largest_F2, largest_F3; if (argc < 7) - fatal("args: table data-source lower_bound discard? unbiased? #shuffles n:1 m:2 ..."); + fatal("args: table data-source lower_bound discard? #shuffles n:1 m:2 ..."); // handle command-line arguments genotypes = atoi(argv[2]); @@ -123,12 +168,10 @@ if (!genotypes && lower_bound <= 0) fatal("minimum coverage should exceed 0"); discard = atoi(argv[4]); - unbiased = atoi(argv[5]); - nshuff = atoi(argv[6]); + nshuff = atoi(argv[5]); saw[0] = saw[1] = saw[2] = 0; // populations 1 and 2 must be disjoint - // population 0 can be replaced by population 1 or 2 - for (i = 7; i < argc; ++i) { + for (i = 6; i < argc; ++i) { if (sscanf(argv[i], "%d:%d", &j, &k) != 2) fatalf("not like 13:2 : %s", argv[i]); if (k < 0 || k > 2) @@ -189,39 +232,129 @@ } fclose(fp); - F1 = ave_Fst(); - printf("average Fst is %5.5f, using %d SNPs\n", F1, nsnp); + pop_Fst(); + printf("Using %d SNPs, we compute:\n", nsnp); + printf("Average Wright FST is %5.5f.\n", F0 = F_wright); + printf("Average Weir-Cockerham FST is %5.5f.\n", F1 = F_weir); + printf("Average Reich-Patterson FST is %5.5f.\n", F2 = F_reich); + printf("The population-based Reich-Patterson Fst is %5.5f.\n", + F3 = N_reich/D_reich); + if (nfail > 0) + printf("WARNING: %d of %d FSTs could not be computed\n", + nfail, 3*nsnp); + if (nshuff == 0) + return 0; + + // do the following only if randomization is requested for (j = 0; j < nI; ++j) - best_x[j] = x[j]; - for (tot_F = largest_F = 0.0, larger = i = 0; i < nshuff; ++i) { + best_x0[j] = best_x1[j] = best_x2[j] = best_x3[j] = x[j]; + tot_F0 = tot_F1 = tot_F2 = tot_F3 = + largest_F0 = largest_F1 = largest_F2 = largest_F3 = 0.0; + larger0 = larger1 = larger2 = larger3 = 0; + for (i = 0; i < nshuff; ++i) { shuffle(); - if ((F = ave_Fst()) > F1) - ++larger; - if (F > largest_F) { - largest_F = F; + pop_Fst(); + + // Wright + if ((F = F_wright) > F0) + ++larger0; + if (F > largest_F0) { + largest_F0 = F; for (j = 0; j < nI; ++j) - best_x[j] = x[j]; + best_x0[j] = x[j]; } - tot_F += F; + tot_F0 += F; +/* if (all) // make this optional? printf("%d: %f\n", i+1, F); +*/ + + // Weir + if ((F = F_weir) > F1) + ++larger1; + if (F > largest_F1) { + largest_F1 = F; + for (j = 0; j < nI; ++j) + best_x1[j] = x[j]; + } + tot_F1 += F; + + // Riech average + if ((F = F_reich) > F2) + ++larger2; + if (F > largest_F2) { + largest_F2 = F; + for (j = 0; j < nI; ++j) + best_x2[j] = x[j]; + } + tot_F2 += F; + + // Reich population + if ((F = (N_reich/D_reich)) > F3) + ++larger3; + if (F > largest_F3) { + largest_F3 = F; + for (j = 0; j < nI; ++j) + best_x3[j] = x[j]; + } + tot_F3 += F; } - if (nshuff > 0) { - printf("%d of %d random groupings had a larger average Fst\n", - larger, nshuff); - printf("largest = %5.5f, mean = %5.5f\n", largest_F, - tot_F/nshuff); - if (largest_F > F1) { - printf("first columns for the best two populations:\n"); - for (i = 0; i < nI; ++i) - if (best_x[i] == 1) - printf("%d ", col[i]); - printf("and\n"); - for (i = 0; i < nI; ++i) - if (best_x[i] == 2) - printf("%d ", col[i]); - putchar('\n'); - } + printf("\nOf %d random groupings:\n", nshuff); + printf("%d had a larger average Wright FST (max %5.5f, mean %5.5f)\n", + larger0, largest_F0, tot_F0/nshuff); + if (largest_F0 > F0) { + printf("first columns for the best two populations:\n"); + for (i = 0; i < nI; ++i) + if (best_x0[i] == 1) + printf("%d ", col[i]); + printf("and\n"); + for (i = 0; i < nI; ++i) + if (best_x0[i] == 2) + printf("%d ", col[i]); + putchar('\n'); + putchar('\n'); + } + printf("%d had a larger average Weir-Cockerham FST (max %5.5f, mean %5.5f)\n", + larger1, largest_F1, tot_F1/nshuff); + if (largest_F1 > F1) { + printf("first columns for the best two populations:\n"); + for (i = 0; i < nI; ++i) + if (best_x1[i] == 1) + printf("%d ", col[i]); + printf("and\n"); + for (i = 0; i < nI; ++i) + if (best_x1[i] == 2) + printf("%d ", col[i]); + putchar('\n'); + putchar('\n'); + } + printf("%d had a larger average Reich-Patterson FST (max %5.5f, mean %5.5f)\n", + larger2, largest_F2, tot_F2/nshuff); + if (largest_F2 > F2) { + printf("first columns for the best two populations:\n"); + for (i = 0; i < nI; ++i) + if (best_x2[i] == 1) + printf("%d ", col[i]); + printf("and\n"); + for (i = 0; i < nI; ++i) + if (best_x2[i] == 2) + printf("%d ", col[i]); + putchar('\n'); + putchar('\n'); + } + printf("%d had a larger Reich-Patterson population FST (max %5.5f, mean %5.5f)\n", + larger3, largest_F3, tot_F3/nshuff); + if (largest_F3 > F3) { + printf("first columns for the best two populations:\n"); + for (i = 0; i < nI; ++i) + if (best_x3[i] == 1) + printf("%d ", col[i]); + printf("and\n"); + for (i = 0; i < nI; ++i) + if (best_x3[i] == 2) + printf("%d ", col[i]); + putchar('\n'); + putchar('\n'); } return 0; diff -r a3af29edcce2 -r f04f40a36cc8 genome_diversity/src/Fst_column.c --- a/genome_diversity/src/Fst_column.c Fri Sep 28 11:57:18 2012 -0400 +++ b/genome_diversity/src/Fst_column.c Tue Oct 23 12:41:52 2012 -0400 @@ -9,7 +9,7 @@ * argv[5] = 1 to retain SNPs that fail to satisfy the lower bound and set * Fst = -1; delete them if argv[4] = 0. * argv[6] = 1 to discard SNPs that appear fixed in the two populations -* argv[7] = 1 for unbiased estimator, else 0 for the original Wright form. +* argv[7] = 0 for the original Wright form, 1 for Weir, 2 for Reich * argv[8], argv[9], ..., have the form "13:1" or "13:2", meaning that * the 13th, 14th, and 15th columns (base 1) give the allele counts * and genotype for an individual that is in population 1 or @@ -17,9 +17,9 @@ What It Does on Galaxy -The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to select individuals from a SNP table. No individual can be in both populations. Other choices are as follows. +The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to specify individuals from a SNP table. No individual can be in both populations. Other choices are as follows. -Data soure. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations. +Data source. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations. After specifying the data source, the user sets lower bounds on amount of data required at a SNP. For estimating the Fst using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual. @@ -27,10 +27,24 @@ The user specifies whether SNPs where both populations appear to be fixed for the same allele should be retained or discarded. -Finally, the user chooses which definition of Fst to use: Wright's original definition or Weir's unbiased estimator. +Finally, the user chooses which definition of Fst to use: Wright's original definition, the Weir-Cockerham unbiased estimator, or the Reich-Patterson estimator. A column is appended to the SNP table giving the Fst for each retained SNP. +References: + +Sewall Wright (1951) The genetical structure of populations. Ann Eugen 15:323-354. + +B. S. Weir and C. Clark Cockerham (1984) Estimating F-statistics for the analysis of population structure. Evolution 38:1358–1370. + +Weir, B.S. 1996. Population substructure. Genetic data analysis II, pp. 161–173. Sinauer Associates, Sunderland, MA. + +David Reich, Kumarasamy Thangaraj, Nick Patterson, Alkes L. Price, and Lalji Singh (2009) Reconstructing Indian population history. Nature 461:489-494, especially Supplement 2. + +Their effectiveness for computing FSTs when there are many SNPs but few individuals is discussed in the followoing paper. + +Eva-Maria Willing, Christine Dreyer, Cock van Oosterhout (2012) Estimates of genetic differentiation measured by FST do not necessarily require large sample sizes when using many SNP markers. PLoS One 7:e42649. + */ #include "lib.h" @@ -48,7 +62,7 @@ char *p, *z = "\t\n", buf[MOST], trash[MOST]; int X[MOST], min_cov, min_qual, retain, discard, unbiased, genotypes, n, i, g, A1, B1, A2, B2, saw[3], x1, y1, x2, y2; - double F; + double F, N, D; if (argc < 7) fatal("args: table data-source lower-bound retain? discard? unbiased? n:1 m:2 ..."); @@ -119,8 +133,20 @@ continue; // not variable in the two populations if (x1+y1 < min_cov || x2+y2 < min_cov) F = -1.0; - else - F = Fst(A1, B1, A2, B2, unbiased); + else { + if (unbiased == 0) + wright(A1, A2, B1, B2, &N, &D); + else if (unbiased == 1) + weir(A1, A2, B1, B2, &N, &D); + else if (unbiased == 2) + reich(A1, A2, B1, B2, &N, &D); + else + fatal("impossible value of 'unbiased'"); + if (D == 0.0) + continue; // ignore these SNPs + else + F = N/D; + } if (F == -1.0 && !retain) continue; if ((p = strchr(buf, '\n')) != NULL) diff -r a3af29edcce2 -r f04f40a36cc8 genome_diversity/src/Fst_lib.c --- a/genome_diversity/src/Fst_lib.c Fri Sep 28 11:57:18 2012 -0400 +++ b/genome_diversity/src/Fst_lib.c Tue Oct 23 12:41:52 2012 -0400 @@ -1,49 +1,73 @@ -// procedure to compute either Wright's Fst or an unbiased estimator of if +// This file contains three procedures for computing different variants of Fst. + +#include "Fst_lib.h" -#include "lib.h" -// Wright's Fst -static double Wright(double f1, double f2) { +/* For each of wright, weir and reich, args 1 and 2 are counts for one +* allele in the two populations; args 3 and 4 are counts for the other allele. +* The numerator and denominator of the computed Fst are returned through +* args 5 and 6. +*/ + +void wright(int A1, int A2, int B1, int B2, double *N, double *D) { + double a1 = A1, a2 = A2, b1 = B1, b2 = B2, n1, n2, p1, p2; + double - f, // frequency in the pooled population + p, // frequency in the pooled population H_ave, // average of HWE heterogosity in the two populations H_all; // HWE heterozygosity in the pooled popuations - H_ave = f1*(1.0 - f1) + f2*(1.0 - f2); - f = (f1 + f2)/2.0; - if (f == 0.0 || f == 1.0) - return 0.0; - H_all = 2.0*f*(1.0 - f); - return (H_all - H_ave) / H_all; + n1 = a1+b1; + n2 = a2+b2; + if (n1 == 0.0 || n2 == 0.0) { + // let the calling program handle it + *N = *D = 0.0; + return; + } + p1 = a1/n1; + p2 = a2/n2; + H_ave = p1*(1.0 - p1) + p2*(1.0 - p2); + p = (p1 + p2)/2.0; + H_all = 2.0*p*(1.0 - p); + *N = H_all - H_ave; + *D = H_all; } -/* unbiased estimator of Fst from: - Weir, B.S. and Cockerham, C.C. 1984. Estimating F-statistics for the - analysis of population structure. Evolution 38: 1358–1370. -as interpreted by: - Akey, J.M., Zhang, G., Zhang, K., Jin, L., and Shriver, M.D. 2002. - Interrogating a high-density SNP map for signatures of natural - selection. Genome Res. 12: 1805–1814. -*/ -static double Weir(int n1, double p1, int n2, double p2) { - double F, p_bar, nc, MSP, MSG, N = n1 + n2; +void weir(int A1, int A2, int B1, int B2, double *N, double *D) { + double a1 = A1, a2 = A2, b1 = B1, b2 = B2, n1, n2, p1, p2, + n_tot, p_bar, nc, MSP, MSG; - if (p1 == p2) - return 0.0; - MSG = (n1*p1*(1.0-p1) + n2*p2*(1.0-p2))/(N-1.0); - p_bar = (n1*p1 + n2*p2)/N; + n1 = a1+b1; + n2 = a2+b2; + if (n1 == 0.0 || n2 == 0.0) { + // let the calling program handle it + *N = *D = 0.0; + return; + } + n_tot = n1 + n2; + p1 = a1/n1; + p2 = a2/n2; + + MSG = (n1*p1*(1.0-p1) + n2*p2*(1.0-p2))/(n_tot-2.0); + p_bar = (n1*p1 + n2*p2)/n_tot; MSP = n1*(p1-p_bar)*(p1-p_bar) + n2*(p2-p_bar)*(p2-p_bar); - nc = N - (double)(n1*n1 + n2*n2)/N; - F = (MSP - MSG) / (MSP + (nc-1)*MSG); - if (F < 0.0) - F = 0.0; - return F; + nc = n_tot - (n1*n1 + n2*n2)/n_tot; + *N = MSP - MSG; + *D = MSP + (nc-1)*MSG; } -double Fst(int nA1, int na1, int nA2, int na2, int unbiased) { - double p1, p2; +void reich(int A1, int A2, int B1, int B2, double *N, double *D) { + double a1 = A1, a2 = A2, b1 = B1, b2 = B2, n1, n2, h1, h2, x; - p1 = (double)nA1 / (double)(nA1+na1); - p2 = (double)nA2 / (double)(nA2+na2); - - return (unbiased ? Weir(nA1+na1, p1, nA2+na2, p2) : Wright(p1, p2)); + n1 = a1+b1; + n2 = a2+b2; + if (n1<=1 || n2<=1) { + // let the calling program handle it + *N = *D = 0.0; + return; + } + h1 = (a1*(n1-a1)) / (n1*(n1-1)); + h2 = (a2*(n2-a2)) / (n2*(n2-1)); + x = a1/n1 - a2/n2; + *N = x*x - h1/n1 - h2/n2; + *D = *N + h1 + h2; } diff -r a3af29edcce2 -r f04f40a36cc8 genome_diversity/src/Fst_lib.h --- a/genome_diversity/src/Fst_lib.h Fri Sep 28 11:57:18 2012 -0400 +++ b/genome_diversity/src/Fst_lib.h Tue Oct 23 12:41:52 2012 -0400 @@ -1,8 +1,11 @@ -/* return either Sewall Wright's Fst or its Weir unbiased estimator -* parameters are as follows -* 1, 2 : frequencies of the two alleles in population 1 -* 3, 4 : frequencies of the two alleles in population 2 -* 5 : 0 = return Wright's formulation, 1 = return unbiased estimator +// Fst_lib.h + +/* For each of wright, weir and reich, args 1 and 2 are counts for one +* allele in the two populations; args 3 and 4 are counts for the other allele. +* The numerator and denominator of the computed Fst are returned through +* args 5 and 6. */ -double Fst(int, int, int, int, int); +void wright(int, int , int , int , double * , double * ); +void weir(int, int , int , int , double * , double * ); +void reich(int, int , int , int , double * , double * ); diff -r a3af29edcce2 -r f04f40a36cc8 phylogenetic_tree.xml --- a/phylogenetic_tree.xml Fri Sep 28 11:57:18 2012 -0400 +++ b/phylogenetic_tree.xml Tue Oct 23 12:41:52 2012 -0400 @@ -31,7 +31,7 @@ - + @@ -41,16 +41,17 @@ - + - + - + @@ -133,49 +134,49 @@ The input parameters are: SNP dataset - A table of SNPs for various individuals, in gd_snp format. + A table of SNPs for various individuals, in gd_snp format. Individuals - By default all individuals are included in the analysis, but this can - optionally be restricted to a subset that has been defined using the - Specify Individuals tool. + By default all individuals are included in the analysis, but this can + optionally be restricted to a subset that has been defined using the + Specify Individuals tool. -Minimum coverage - For each pair of individuals, the tool looks for informative SNPs, i.e., - where the sequence data for both individuals is adequate according to - some criterion. Specifying, say, 7 for this option instructs the tool - to consider only SNPs with coverage at least 7 in both individuals - when estimating their "genetic distance". +Minimum SNP coverage + For each pair of individuals, the tool looks for informative SNPs, i.e., + where the sequence data for both individuals is adequate. Specifying, + say, 7 for this option instructs the tool to consider only SNPs with + at least 7 reads in each of the two individuals (regardless of the + alleles) when estimating their genetic distance. -Minimum quality - Specifying, say, 37 for this option instructs the tool to consider - only SNPs with SAMtools quality value at least 37 in both individuals - when estimating their "genetic distance". +Minimum SNP quality + Specifying, say, 37 for this option instructs the tool to consider + only SNPs with a quality score of at least 37 in both individuals + when estimating their genetic distance. Include reference sequence - For gd_snp datasets containing columns for a reference sequence, the - user can ask that the reference be indicated in the tree, to help with - rooting it. If the dataset has no reference columns, this option has - no effect. + For gd_snp datasets containing columns for a reference sequence, the + user can ask that the reference be indicated in the tree, to help with + rooting it. If the dataset has no reference columns, this option has + no effect. -Data source - The genetic distance between two individuals at a given SNP can - be estimated two ways. One method is to use the absolute value of the - difference in the frequency of the first allele (or equivalently, the - second allele). For instance, if the first individual has 5 reads of - each allele and the second individual has respectively 3 and 6 reads, - then the frequencies are 1/2 and 1/3, giving a distance 1/6 at that - SNP. The other approach is to use the SAMtools genotypes to estimate - the difference in the number of occurrences of the first allele. - For instance, if the two genotypes are 2 and 1, i.e., the individuals - are estimated to have respectively 2 and 1 occurrences of the first - allele at this location, then the distance is 1 (the absolute value - of the difference of the two numbers). +Distance metric + The genetic distance between two individuals at a given SNP can + be estimated two ways. One method is to use the absolute value of the + difference in the frequency of the first allele (or equivalently, the + second allele). For instance, if the first individual has 5 reads of + each allele and the second individual has respectively 3 and 6 reads, + then the frequencies are 1/2 and 1/3, giving a distance 1/6 at that + SNP. The other approach is to use the genotype calls to estimate + the difference in the number of occurrences of the first allele. + For instance, if the two genotypes are 2 and 1, i.e., the individuals + are estimated to have respectively 2 and 1 occurrences of the first + allele at this location, then the distance is 1 (the absolute value + of the difference of the two numbers). Output options - The final four options apply mostly to the graphical drawing of the - tree, except that the branch lengths are also added to the Newick text - file. + The final four options apply mostly to the graphical drawing of the + tree, except that the branch lengths are also added to the Newick text + file. ----- diff -r a3af29edcce2 -r f04f40a36cc8 prepare_population_structure.xml --- a/prepare_population_structure.xml Fri Sep 28 11:57:18 2012 -0400 +++ b/prepare_population_structure.xml Tue Oct 23 12:41:52 2012 -0400 @@ -19,13 +19,10 @@ - - - - - + + @@ -34,6 +31,9 @@ + + + @@ -62,11 +62,8 @@ **Dataset formats** -The input datasets are in gd_snp_ and gd_indivs_ formats. It is important -for the Individuals datasets to have unique names; rename them if -necessary to make them unique. These names are used by the later tools in -the graphical displays. -The output dataset is gd_ped_. (`Dataset missing?`_) +The input datasets are in gd_snp_ and gd_indivs_ formats. +The output dataset is in gd_ped_ format. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp .. _gd_indivs: ./static/formatHelp.html#gd_indivs @@ -77,16 +74,22 @@ **What it does** -The tool converts a gd_snp dataset into two tables, called "admix.map" and -"admix.ped", needed for estimating the population structure. The user -can read or download those files, or simply pass this tool's output on to -other programs. The user imposes conditions on which SNPs to consider, -such as the minimum coverage and/or quality value for every individual, -or the distance to the closest SNP in the same contig (as named in the -first column of the SNP table). A useful piece of information produced -by the tool is the number of SNPs meeting those conditions, which can -be found by clicking on the eye icon in the history panel after the program -runs. +This tool converts a gd_snp dataset into the format needed for estimating +the population structure. You can select the individuals to be included, +by using "population" datasets created via the Specify Individuals tool. +(It is important for these population datasets to have distinguishable names, +since they will be stored in the output's metadata so that subsequent tools +can use them as labels. If necessary, rename the datasets to give them +distinct and meaningful names before running this tool.) + +You can also filter the SNPs, based on criteria such as minimum coverage +(a qualifying SNP must have at least this many reads for every included +individual), minimum quality score (for every included individual), and/or +minimum spacing (SNPs that are too close together on the same chromosome or +scaffold are discarded). In addition to producing the filtered and formatted +.map and .ped files for subsequent analysis, the tool reports the number of +SNPs meeting these conditions, which can be seen by clicking on the eye icon +in the history panel after the program runs. ----- @@ -94,25 +97,36 @@ - input:: - Contig161_chr1_4641264_4641879 115 C T 73.5 chr1 4641382 C 6 0 2 45 8 0 2 51 15 0 2 72 5 0 2 42 6 0 2 45 10 0 2 57 Y 54 0.323 0 - Contig48_chr1_10150253_10151311 11 A G 94.3 chr1 10150264 A 1 0 2 30 1 0 2 30 1 0 2 30 3 0 2 36 1 0 2 30 1 0 2 30 Y 22 +99. 0 - Contig20_chr1_21313469_21313570 66 C T 54.0 chr1 21313534 C 4 0 2 39 4 0 2 39 5 0 2 42 4 0 2 39 4 0 2 39 5 0 2 42 N 1 +99. 0 + Contig161_chr1_4641264_4641879 115 C T 73.5 chr1 4641382 C 6 0 2 45 8 0 2 51 15 0 2 72 5 0 2 42 6 0 2 45 10 0 2 57 Y 54 0.323 0 + Contig48_chr1_10150253_10151311 11 A G 94.3 chr1 10150264 A 1 0 2 30 1 0 2 30 1 0 2 30 3 0 2 36 1 0 2 30 1 0 2 30 Y 22 +99. 0 + Contig20_chr1_21313469_21313570 66 C T 54.0 chr1 21313534 C 4 0 2 39 4 0 2 39 5 0 2 42 4 0 2 39 4 0 2 39 5 0 2 42 N 1 +99. 0 etc. -- output map file:: +- output cover page:: + + Prepare to look for population structure Galaxy Composite Dataset + Output completed: 2012-10-01 04:09:36 PM + + Outputs + * admix.ped (link) + * admix.map (link) + * Using 222 of 400 SNPs - 1 snp1 0 2 - 1 snp3 0 4 - 1 snp4 0 5 - 1 snp5 0 6 - 1 snp6 0 7 - 1 snp7 0 8 - 1 snp8 0 9 - 1 snp9 0 10 + Inputs + * Minimum reads covering a SNP, per individual: 6 + * Minimum quality value, per individual: 0 + * Minimum spacing between SNPs on the same scaffold: 0 -- output ped file:: - - PB1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + Populations + * Pop. A + 1. PB1 + 2. PB2 + * Pop. B + 1. PB3 + 2. PB4 + * Pop. C + 1. PB6 + 2. PB8