Mercurial > repos > miller-lab > genome_diversity
changeset 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
line wrap: on
line diff
--- a/add_fst_column.py Wed May 22 15:58:18 2013 -0400 +++ b/add_fst_column.py Tue May 28 16:24:19 2013 -0400 @@ -18,8 +18,8 @@ print >> sys.stderr, "Usage" sys.exit(1) -input, p1_input, p2_input, genotypes, min_reads, min_qual, retain, discard_fixed, biased, output = sys.argv[1:11] -individual_metadata = sys.argv[11:] +input, p1_input, p2_input, input_type, genotypes, min_reads, min_qual, retain, discard_fixed, biased, output = sys.argv[1:12] +individual_metadata = sys.argv[12:] p_total = Population() p_total.from_tag_list(individual_metadata) @@ -52,10 +52,14 @@ columns = p1.column_list() for column in columns: + if input_type == 'gd_genotype': + column = int(column) - 2 args.append('{0}:1'.format(column)) columns = p2.column_list() for column in columns: + if input_type == 'gd_genotype': + column = int(column) - 2 args.append('{0}:2'.format(column)) fh = open(output, 'w')
--- a/add_fst_column.xml Wed May 22 15:58:18 2013 -0400 +++ b/add_fst_column.xml Tue May 28 16:24:19 2013 -0400 @@ -1,8 +1,19 @@ -<tool id="gd_add_fst_column" name="Per-SNP FSTs" version="1.1.0"> +<tool id="gd_add_fst_column" name="Per-SNP FSTs" version="1.2.0"> <description>: Compute a fixation index score for each SNP</description> <command interpreter="python"> - add_fst_column.py "$input" "$p1_input" "$p2_input" "$data_source" "$min_reads" "$min_qual" "$retain" "$discard_fixed" "$biased" "$output" + add_fst_column.py "$input" "$p1_input" "$p2_input" + #if $input_type.choice == '0' + "gd_snp" "$input_type.data_source.choice" + #if $input_type.data_source.choice == '0' + "$input_type.data_source.min_reads" "$input_type.data_source.min_qual" + #else if $input_type.data_source.choice == '1' + "0" "0" + #end if + #else if $input_type.choice == '1' + "gd_genotype" "1" "0" "0" + #end if + "$retain" "$discard_fixed" "$biased" "$output" #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) #set $arg = '%s:%s' % ($individual_col, $individual) "$arg" @@ -10,18 +21,35 @@ </command> <inputs> - <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + <conditional name="input_type"> + <param name="choice" type="select" format="integer" label="Input format"> + <option value="0" selected="true">gd_snp</option> + <option value="1">gd_genotype</option> + </param> + + <when value="0"> + <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + + <conditional name="data_source"> + <param name="choice" type="select" format="integer" label="Frequency metric"> + <option value="0">sequence coverage</option> + <option value="1" selected="true">estimated genotype</option> + </param> + <when value="0"> + <param name="min_reads" type="integer" min="0" value="0" label="Minimum total read count for a population" /> + <param name="min_qual" type="integer" min="0" value="0" label="Minimum individual genotype quality" /> + </when> + <when value="1"/> + </conditional> + </when> + <when value="1"> + <param name="input" type="data" format="gd_genotype" label="Genotype dataset" /> + </when> + </conditional> + <param name="p1_input" type="data" format="gd_indivs" label="Population 1 individuals" /> <param name="p2_input" type="data" format="gd_indivs" label="Population 2 individuals" /> - <param name="data_source" type="select" format="integer" label="Frequency metric"> - <option value="0">sequence coverage</option> - <option value="1" selected="true">estimated genotype</option> - </param> - - <param name="min_reads" type="integer" min="0" value="0" label="Minimum total read count for a population" /> - <param name="min_qual" type="integer" min="0" value="0" label="Minimum individual genotype quality" /> - <param name="retain" type="select" label="If a SNP is below minimum"> <option value="0" selected="true">skip SNP</option> <option value="1">set FST = -1</option> @@ -41,7 +69,7 @@ </inputs> <outputs> - <data name="output" format="gd_snp" metadata_source="input" /> + <data name="output" format="input" format_source="input" metadata_source="input" /> </outputs> <tests> @@ -63,10 +91,11 @@ **Dataset formats** -The input datasets are in gd_snp_ and gd_indivs_ formats. -The output dataset is in gd_snp_ format. (`Dataset missing?`_) +The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. +The output dataset is in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _gd_indivs: ./static/formatHelp.html#gd_indivs .. _Dataset missing?: ./static/formatHelp.html @@ -76,7 +105,7 @@ The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to specify individuals from a SNP table. No individual can be in both populations. Other choices are as follows. -Frequency metric. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations. +Frequency metric. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele (if the table is in gd_snp format, but not with gd_genotype), or by adding the frequencies inferred from genotypes of individuals in the populations. After specifying the frequency metric, the user sets lower bounds on amount of data required at a SNP. For estimating the Fst using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual.
--- a/average_fst.py Wed May 22 15:58:18 2013 -0400 +++ b/average_fst.py Tue May 28 16:24:19 2013 -0400 @@ -6,12 +6,12 @@ ################################################################################ -if len(sys.argv) < 11: +if len(sys.argv) < 12: print >> sys.stderr, "Usage" sys.exit(1) -input, p1_input, p2_input, data_source, min_total_count, discard_fixed, output, shuffles, p0_input = sys.argv[1:10] -individual_metadata = sys.argv[10:] +input, p1_input, p2_input, input_type, data_source, min_total_count, discard_fixed, output, shuffles, p0_input = sys.argv[1:11] +individual_metadata = sys.argv[11:] try: shuffle_count = int(shuffles) @@ -55,15 +55,21 @@ columns = p1.column_list() for column in columns: + if input_type == 'gd_genotype': + column = int(column) - 2 args.append('{0}:1'.format(column)) columns = p2.column_list() for column in columns: + if input_type == 'gd_genotype': + column = int(column) - 2 args.append('{0}:2'.format(column)) if p0 is not None: columns = p0.column_list() for column in columns: + if input_type == 'gd_genotype': + column = int(column) - 2 args.append('{0}:0'.format(column)) fh = open(output, 'w')
--- a/average_fst.xml Wed May 22 15:58:18 2013 -0400 +++ b/average_fst.xml Tue May 28 16:24:19 2013 -0400 @@ -1,12 +1,23 @@ -<tool id="gd_average_fst" name="Overall FST" version="1.2.0"> +<tool id="gd_average_fst" name="Overall FST" version="1.3.0"> <description>: Estimate the relative fixation index between two populations</description> <command interpreter="python"> - average_fst.py "$input" "$p1_input" "$p2_input" "$data_source.ds_choice" "$data_source.min_value" "$discard_fixed" "$output" - #if $use_randomization.ur_choice == '1' + average_fst.py "$input" "$p1_input" "$p2_input" + #if $input_type.choice == '0' + "gd_snp" "$input_type.data_source.choice" + #if $input_type.data_source.choice == '0' + "$input_type.data_source.min_value" + #else if $input_type.data_source.choice == '1' + "1" + #end if + #else if $input_type.choice == '1' + "gd_genotype" "1" "1" + #end if + "$discard_fixed" "$output" + #if $use_randomization.choice == '0' + "0" "/dev/null" + #else if $use_randomization.choice == '1' "$use_randomization.shuffles" "$use_randomization.p0_input" - #else - "0" "/dev/null" #end if #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) #set $arg = '%s:%s' % ($individual_col, $individual) @@ -15,30 +26,44 @@ </command> <inputs> - <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + <conditional name="input_type"> + <param name="choice" type="select" format="integer" label="Input format"> + <option value="0" selected="true">gd_snp</option> + <option value="1">gd_genotype</option> + </param> + + <when value="0"> + <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + + <conditional name="data_source"> + <param name="choice" type="select" format="integer" label="Frequency metric"> + <option value="0">sequence coverage</option> + <option value="1" selected="true">estimated genotype</option> + </param> + + <when value="0"> + <param name="min_value" type="integer" min="1" value="1" label="Minimum total read count for a population" /> + </when> + + <when value="1"/> + </conditional> + </when> + + <when value="1"> + <param name="input" type="data" format="gd_genotype" label="Genotype dataset" /> + </when> + </conditional> + <param name="p1_input" type="data" format="gd_indivs" label="Population 1 individuals" /> <param name="p2_input" type="data" format="gd_indivs" label="Population 2 individuals" /> - <conditional name="data_source"> - <param name="ds_choice" type="select" format="integer" label="Frequency metric"> - <option value="0">sequence coverage</option> - <option value="1" selected="true">estimated genotype</option> - </param> - <when value="0"> - <param name="min_value" type="integer" min="1" value="1" label="Minimum total read count for a population" /> - </when> - <when value="1"> - <param name="min_value" type="integer" min="1" value="1" label="Minimum individual genotype quality" /> - </when> - </conditional> - <param name="discard_fixed" type="select" label="For SNPs that appear to be fixed across both populations"> <option value="0">retain</option> <option value="1" selected="true">delete</option> </param> <conditional name="use_randomization"> - <param name="ur_choice" type="select" format="integer" label="Use randomization"> + <param name="choice" type="select" format="integer" label="Use randomization"> <option value="0" selected="true">no</option> <option value="1">yes</option> </param> @@ -62,7 +87,7 @@ <param name="ds_choice" value="0" /> <param name="min_value" value="3" /> <param name="discard_fixed" value="1" /> - <param name="ur_choice" value="0" /> + <param name="choice" value="0" /> <output name="output" file="test_out/average_fst/average_fst.txt" /> </test> </tests> @@ -71,10 +96,11 @@ **Dataset formats** -The input datasets are in gd_snp_ and gd_indivs_ formats. +The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. The output dataset is in text_ format. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _gd_indivs: ./static/formatHelp.html#gd_indivs .. _text: ./static/formatHelp.html#text .. _Dataset missing?: ./static/formatHelp.html @@ -85,7 +111,7 @@ The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to specify individuals from a SNP table. No individual can be in both populations. Other choices are as follows. -Frequency metric. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations. +Frequency metric. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele (if the table is in gd_snp format, but not with gd_genotype), or by adding the frequencies inferred from genotypes of individuals in the populations. After specifying the frequency metric, the user sets lower bounds on amount of data required at a SNP. For estimating the FST using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual. SNPs not meeting these lower bounds are ignored.
--- a/commits.log Wed May 22 15:58:18 2013 -0400 +++ b/commits.log Tue May 28 16:24:19 2013 -0400 @@ -1,3 +1,11 @@ + +:6255a0a7fad5 +cathy 2013-05-10 15:45 +Bumped version number of the Pathway Image tool. + +:45ed8c76cabf +cathy 2013-05-10 15:07 +Fix from Oscar to handle changes in the KEGG Mapper web form. :ea75d4a4ded0 cathy 2013-03-04 16:04
--- a/datatypes_conf.xml Wed May 22 15:58:18 2013 -0400 +++ b/datatypes_conf.xml Tue May 28 16:24:19 2013 -0400 @@ -7,6 +7,7 @@ <datatype extension="gd_indivs" type="galaxy.datatypes.wsf:Individuals" display_in_upload="true"/> <datatype extension="gd_ped" type="galaxy.datatypes.wsf:Wped" display_in_upload="true"/> <datatype extension="gd_snp" type="galaxy.datatypes.wsf:GDSnp" display_in_upload="true"/> + <datatype extension="gd_genotype" type="galaxy.datatypes.wsf:GDSnp" subclass="true" display_in_upload="true"/> <datatype extension="gd_sap" type="galaxy.datatypes.wsf:GDSap" display_in_upload="true"/> <datatype extension="gd_covered_cds" type="galaxy.datatypes.interval:Interval" subclass="true" display_in_upload="true"/> </registration>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/diversity_pi.py Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import sys +import subprocess +from Population import Population + +def run_program(prog, args, stdout_file=None, space_to_tab=False): + #print "args: ", ' '.join(args) + p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = p.communicate() + rc = p.returncode + + if stdout_file is not None: + with open(stdout_file, 'w') as ofh: + lines = stdoutdata.split('\n') + for line in lines: + line = line.strip() + if line: + if space_to_tab: + line = line.replace(' ', '\t') + print >> ofh, line + + if rc != 0: + print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) + print >> sys.stderr, stderrdata + sys.exit(1) + +################################################################################ + +if len(sys.argv) < 7: + print >> sys.stderr, "Usage" + sys.exit(1) + +snp_input, coverage_input, indiv_input, min_coverage, output = sys.argv[1:6] +individual_metadata = sys.argv[6:] + +p_total = Population() +p_total.from_tag_list(individual_metadata) + +p1 = Population() +p1.from_population_file(indiv_input) +if not p_total.is_superset(p1): + print >> sys.stderr, 'There is an individual in the population individuals that is not in the SNP table' + sys.exit(1) + +################################################################################ + +prog = 'mt_pi' + +args = [ prog ] + +args.append(snp_input) +args.append(coverage_input) +args.append(min_coverage) + +for tag in p1.tag_list(): + args.append(tag) + +run_program(prog, args, stdout_file=output) +sys.exit(0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/diversity_pi.xml Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,25 @@ +<tool id="gd_diversity_pi" name="Diversity" version="1.0.0"> + <description>&pi;</description> + + <command interpreter="python"> + diversity_pi.py "$input" "$coverage_input" "$indiv_input" "$min_coverage" "$output" + #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) + #set $arg = '%s:%s' % ($individual_col, $individual) + "$arg" + #end for + </command> + + <inputs> + <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + <param name="coverage_input" type="data" format="interval" label="Coverage dataset" /> + <param name="indiv_input" type="data" format="gd_indivs" label="Population Individuals" /> + <param name="min_coverage" type="integer" min="1" value="1" label="Minimum coverage" /> + </inputs> + + <outputs> + <data name="output" format="txt" metadata_source="input" /> + </outputs> + + <help> + </help> +</tool>
--- a/dpmix.py Wed May 22 15:58:18 2013 -0400 +++ b/dpmix.py Tue May 28 16:24:19 2013 -0400 @@ -41,12 +41,12 @@ ################################################################################ -if len(sys.argv) < 15: +if len(sys.argv) < 16: print "usage" sys.exit(1) -input, data_source, switch_penalty, ap1_input, ap2_input, p_input, output, output2, output2_dir, dbkey, ref_column, galaxy_data_index_dir, heterochromatin_loc_file = sys.argv[1:14] -individual_metadata = sys.argv[14:] +input, input_type, data_source, switch_penalty, ap1_input, ap2_input, p_input, output, output2, output2_dir, dbkey, ref_column, galaxy_data_index_dir, heterochromatin_loc_file = sys.argv[1:15] +individual_metadata = sys.argv[15:] chrom = 'all' add_logs = '0' @@ -104,15 +104,24 @@ columns = ap1.column_list() for column in columns: - args.append('{0}:1:{1}'.format(column, ap1.individual_with_column(column).name)) + if input_type == 'gd_genotype': + args.append('{0}:1:{1}'.format(int(column) - 2, ap1.individual_with_column(column).name)) + else: + args.append('{0}:1:{1}'.format(column, ap1.individual_with_column(column).name)) columns = ap2.column_list() for column in columns: - args.append('{0}:2:{1}'.format(column, ap2.individual_with_column(column).name)) + if input_type == 'gd_genotype': + args.append('{0}:2:{1}'.format(int(column) - 2, ap2.individual_with_column(column).name)) + else: + args.append('{0}:2:{1}'.format(column, ap2.individual_with_column(column).name)) columns = p.column_list() for column in columns: - args.append('{0}:0:{1}'.format(column, p.individual_with_column(column).name)) + if input_type == 'gd_genotype': + args.append('{0}:0:{1}'.format(int(column) - 2, p.individual_with_column(column).name)) + else: + args.append('{0}:0:{1}'.format(column, p.individual_with_column(column).name)) run_program(None, args, stdout_file=output, space_to_tab=True)
--- a/dpmix.xml Wed May 22 15:58:18 2013 -0400 +++ b/dpmix.xml Tue May 28 16:24:19 2013 -0400 @@ -1,8 +1,14 @@ -<tool id="gd_dpmix" name="Admixture" version="1.0.0"> +<tool id="gd_dpmix" name="Admixture" version="1.1.0"> <description>: Map genomic intervals resembling specified ancestral populations</description> <command interpreter="python"> - dpmix.py "$input" "$data_source" "$switch_penalty" "$ap1_input" "$ap2_input" "$p_input" "$output" "$output2" "$output2.files_path" "$input.dataset.metadata.dbkey" "$input.dataset.metadata.ref" "$GALAXY_DATA_INDEX_DIR" "gd.heterochromatic.loc" + dpmix.py "$input" + #if $input_type.choice == '0' + "gd_snp" "$input_type.data_source" + #else if $input_type.choice == '1' + "gd_genotype" "1" + #end if + "$switch_penalty" "$ap1_input" "$ap2_input" "$p_input" "$output" "$output2" "$output2.files_path" "$input.dataset.metadata.dbkey" "$input.dataset.metadata.ref" "$GALAXY_DATA_INDEX_DIR" "gd.heterochromatic.loc" #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) #set $arg = '%s:%s' % ($individual_col, $individual) "$arg" @@ -10,18 +16,32 @@ </command> <inputs> - <param name="input" type="data" format="gd_snp" label="SNP dataset"> - <validator type="unspecified_build" message="This dataset does not have a reference species and cannot be used with this tool" /> - </param> + <conditional name="input_type"> + <param name="choice" type="select" format="integer" label="Input format"> + <option value="0" selected="true">gd_snp</option> + <option value="1">gd_genotype</option> + </param> + <when value="0"> + <param name="input" type="data" format="gd_snp" label="SNP dataset"> + <validator type="unspecified_build" message="This dataset does not have a reference species and cannot be used with this tool" /> + </param> + + <param name="data_source" type="select" format="integer" label="Similarity metric"> + <option value="0">sequence coverage</option> + <option value="1" selected="true">estimated genotype</option> + </param> + </when> + <when value="1"> + <param name="input" type="data" format="gd_genotype" label="Genotype dataset"> + <validator type="unspecified_build" message="This dataset does not have a reference species and cannot be used with this tool" /> + </param> + </when> + </conditional> + <param name="ap1_input" type="data" format="gd_indivs" label="Ancestral population 1 individuals" /> <param name="ap2_input" type="data" format="gd_indivs" label="Ancestral population 2 individuals" /> <param name="p_input" type="data" format="gd_indivs" label="Potentially admixed individuals" /> - <param name="data_source" type="select" format="integer" label="Similarity metric"> - <option value="0" selected="true">sequence coverage</option> - <option value="1">estimated genotype</option> - </param> - <param name="switch_penalty" type="integer" min="0" value="10" label="Genotype switch penalty" help="Note: typically between 10 and 100."/> </inputs> @@ -52,13 +72,14 @@ **Dataset formats** -The input datasets are in gd_snp_ and gd_indivs_ formats. It is important for +The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. It is important for the Individuals datasets to have unique names and for there to be no overlap between the two populations. Rename these datasets if needed to make them unique. There are two output datasets, one tabular_ and one composite. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _gd_indivs: ./static/formatHelp.html#gd_indivs .. _tabular: ./static/formatHelp.html#tab .. _Dataset missing?: ./static/formatHelp.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/draw_variants.py Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,78 @@ +#!/usr/bin/env python + +import sys +import subprocess +from Population import Population + +def run_program(prog, args, stdout_file=None, space_to_tab=False): + #print "args: ", ' '.join(args) + p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = p.communicate() + rc = p.returncode + + if stdout_file is not None: + with open(stdout_file, 'w') as ofh: + lines = stdoutdata.split('\n') + for line in lines: + line = line.strip() + if line: + if space_to_tab: + line = line.replace(' ', '\t') + print >> ofh, line + + if rc != 0: + print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) + print >> sys.stderr, stderrdata + sys.exit(1) + +################################################################################ + +if len(sys.argv) < 10: + print >> sys.stderr, "Usage" + sys.exit(1) + +snp_input, indel_input, coverage_input, annotation_input, indiv_input, ref_name, min_coverage, output = sys.argv[1:9] +individual_metadata = sys.argv[9:] + +p_total = Population() +p_total.from_tag_list(individual_metadata) + +p1 = Population() +p1.from_population_file(indiv_input) +if not p_total.is_superset(p1): + print >> sys.stderr, 'There is an individual in the population individuals that is not in the SNP table' + sys.exit(1) + +################################################################################ + +prog = 'mk_Ji' + +args = [ prog ] + +args.append(snp_input) +args.append(indel_input) +args.append(coverage_input) +args.append(annotation_input) +args.append(min_coverage) +args.append(ref_name) + +for tag in p1.tag_list(): + args.append(tag) + +run_program(prog, args, stdout_file='mk_Ji.out') + +################################################################################ + +prog = 'varplot' + +args = [ prog ] +args.append('-w') +args.append('3') +args.append('-s') +args.append('0.3') +args.append('-g') +args.append('0.2') +args.append('mk_Ji.out') + +run_program(prog, args, stdout_file=output) +sys.exit(0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/draw_variants.xml Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,36 @@ +<tool id="gd_draw_variants" name="Draw" version="1.0.0"> + <description>variants</description> + + <command interpreter="python"> + draw_variants.py "$input" "$indel_input" "$coverage_input" "$annotation_input" "$indiv_input" "$ref_name" "$min_coverage" "$output" + #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) + #set $arg = '%s:%s' % ($individual_col, $individual) + "$arg" + #end for + </command> + + <inputs> + <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + <param name="indel_input" type="data" format="gd_snp" label="Indel dataset" /> + <param name="coverage_input" type="data" format="interval" label="Coverage dataset" /> + <param name="annotation_input" type="data" format="interval" label="Annotation dataset" /> + <param name="indiv_input" type="data" format="gd_indivs" label="Population Individuals" /> + + <param name="ref_name" type="select" label="Ref name"> + <options from_dataset="indiv_input"> + <column name="name" index="1"/> + <column name="value" index="1"/> + <filter type="add_value" name="default" value="default" index="0" /> + </options> + </param> + + <param name="min_coverage" type="integer" min="1" value="1" label="Minimum coverage" /> + </inputs> + + <outputs> + <data name="output" format="svg" /> + </outputs> + + <help> + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genome_diversity/bin/varplot Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,464 @@ +#!/usr/bin/env python + +""" +Take a specification file and draw the Miller plot. + +The specification should have a header where each line starts with a "@". "@" +should be followed by a tag and value separated by a "=". Currently the only +defined tag is GL which is the genome length of the genome under consideration. + +The lines after the header should be one for each genome. The first column +should be the name of the individual/genome followed by the space separated +positions which need to be marked. + +An example spec file will be as follows: + +@GN=IR_3 +@GL=14574 +@GA=0:64::tRNA +@GA=64:1035:nad2:gene +@GA=1035:1100::tRNA +@GA=1092:1153::tRNA +@GA=1160:1226::tRNA +@GA=1218:2757:cox1:gene +@GA=2764:3440:cox2:gene +@GA=3440:3509::tRNA +@GA=3508:3574::tRNA +@GA=3574:3730:atp8:gene +@GA=3723:4389:atp6:gene +@GA=4395:5173:cox3:gene +@GA=5173:5236::tRNA +@GA=5236:5572:nad3:gene +@GA=5572:5633::tRNA +@GA=5632:5696::tRNA +@GA=5695:5763::tRNA +@GA=5765:5820::tRNA +@GA=5820:5885::tRNA +@GA=5883:5948::tRNA +@GA=5948:7617:nad5:gene +@GA=7617:7678::tRNA +@GA=7680:8997:nad4:gene +@GA=8990:9266:nad4L:gene +@GA=9268:9330::tRNA +@GA=9330:9395::tRNA +@GA=9397:9826:nad6:gene +@GA=9829:10910:cob:gene +@GA=10910:10976::tRNA +@GA=10993:11912:nad1:gene +@GA=11912:11978::tRNA +@GA=11992:12053::tRNA +@GA=12034:13289:rrnL:gene +@GA=12034:13289:16S:rRNA +@GA=13289:13351::tRNA +@GA=13351:14069:rrnS:gene +@GA=13351:14069:12S:rRNA +@GA=14423:14492::tRNA +@GA=14499:14569::tRNA +@CL=rRNA:#2B83BA +@CL=tRNA:#FFFFBF +@CL=gene:#D7191C +@CL=special:#000000 +@CL=indel:#FDAE61 +@CL=missing:#ABDDA4 +IR_65 2618 3267 3752 7768 8523 special=10177 10848 12790 13157 indel=3500:3560 +missing=4000:6000 +IR_66 2618 3267 3752 7768 8523 special=10177 10848 12790 13157 missing=4000:6000 +IR_63 2618 3267 3752 4883 8523 9798 10848 13157 missing=1:1000 +""" + +from sys import argv, stderr, exit +from getopt import getopt, GetoptError +from commands import getstatusoutput + +__author__ = "Aakrosh Ratan" +__email__ = "ratan@bx.psu.edu" + +# do we want the debug information to be printed? +debug_flag = False + +varstrokewidth = 1 + +# print the header for the image.. Inputs are in cm +def printheader(imagewidth, imageheight): + print "<?xml version=\"1.0\" standalone=\"no\"?>" + print "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"" + print "\"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">" + + print "<svg width=\"%dcm\"" % imagewidth + print "\theight=\"%dcm\"" % imageheight + print "\tviewBox=\"0 0 %d %d\"" % (100*imagewidth, 100*imageheight) + print "\txmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\">" + +# print the footer for the svg image.. Inputs are in cm +def printfooter(): + print "</svg>" + +# print a rectangle +def printrectangle(x, y, w, h, + sw = 2, + so = 1.0, + rx = None, + cp = None, + fl = "none", + fo = 1.0): +# print >> stderr, "Rectangle: %d %d %d %d" % (x, y, w, h) + print "<rect x=\"%d\"" % x + print "\ty=\"%d\"" % y + print "\twidth=\"%d\"" % w + print "\theight=\"%d\"" % h + if rx != None: print "\trx=\"%d\"" % rx + print "\tstroke=\"black\"" + if cp != None: print "\tclip-path=\"url(#g%d)\"" % cp + print "\tstroke-width=\"%d\"" % sw + print "\tstroke-opacity=\"%2.2f\"" % so + print "\tfill-opacity=\"%2.2f\"" % fo + print "\tfill=\"%s\" />" % fl + +def printtext(x, y, text, + fontfamily = "Times", + fontweight = "bold", + fontsize = "0.9cm", + fontvariant = "normal", + fill = "black", + dx = 0): +# print >> stderr, "Text: %d %d %s" % (x, y, text) + print "<text x=\"%d\"" % x + print "\tdx=\"%d\"" % dx + print "\ty=\"%d\"" % y + print "\tfill=\"%s\"" % fill + print "\tfont-family=\"%s\"" % fontfamily + print "\tfont-weight=\"%s\"" % fontweight + print "\tfont-size=\"%s\"" % fontsize + print "\tfont-variant=\"%s\">" % fontvariant + print "\t%s" % text + print "</text>" + +def printline(x1, y1, x2, y2, sw = 1, cp = None, sc = "black"): +# print >> stderr, "Line: %d %d %d %d" % (x1, y1, x2, y2) + print "<line x1=\"%d\"" % x1 + print "\ty1=\"%d\"" % y1 + print "\tx2=\"%d\"" % x2 + print "\ty2=\"%d\"" % y2 + print "\tstroke=\"%s\"" % sc + if cp != None: print "\tclip-path=\"url(#g%d)\"" % cp + print "\tstroke-width=\"%d\"/>" % sw + +def main(filename, cm2bp, eachplotheight, vgap): + file = open(filename, "r") + + # the genome length + genomelength = None + # the name of the genome + genomename = None + # the atrributes for the genome + attributes = [] + # the colors of the various attributes + colors = {} + # how much of the box should be filled + filltypes = {} + + # the variants that should be marked + variants = {} + # the order in which I want to display the names + names = [] + + # lets read the spec file and make sure the format is correct + for line in file: + if line.startswith("\n"): continue + + if line.startswith("#"): continue + + if line.startswith("@"): + tag,value = line.strip().split("=") + if tag[1:] == "GL": + genomelength = int(value) + elif tag[1:] == "GN": + genomename = value + elif tag[1:] == "GA": + tokens = value.split(":") + assert len(tokens) == 4 + attributes.append(tokens) + elif tag[1:] == "CL": + tokens = value.split(":") + filltype = "C" + if len(tokens) == 2: + name,color = tokens + elif len(tokens) == 3: + name,color,filltype = tokens + else: + print >> stderr, "Incorrect specification for colors" + exit(4) + colors[name] = color + assert filltype in ["C","L","U"], "color can include C,L,U attributes for the fillsizes" + filltypes[name] = filltype + else: + print >> stderr, "Undefined tag: %s" % tag + exit(5) + continue + + tokens = line.strip().split() + + name = tokens[0] + + positions = [] + specialpositions = [] + intervals = [] + for token in tokens[1:]: + if token.find("=") == -1: + positions.append(int(token)) + else: + type,position = token.split("=") + if position.find(":") == -1: + specialpositions.append((type, int(position))) + else: + s,e = position.split(":") + intervals.append((type, int(s), int(e))) + + variants[name] = (positions, specialpositions, intervals) + names.append(name) + file.close() + + # if genomename or genomelength is not specified, tell the user that it is + # necessary to do so + if genomename == None or genomelength == None: + print >> stderr, \ + "Please specify a tags for genomename (@GN) and genomelength (@GL)" + exit(6) + + # how much space would I need for the name + namelengths = [len(x) for x in names] + namelengths.append(len(genomename)) + namelength = max(namelengths) + + # gap between the name and the bar plots themselves (in cm) + hgap = 0.3 + + # the padding on the left side (in cm) + lpad = 0.5 + # the padding on the right (in cm) + rpad = 1.0 + # the padding on the top (in cm) + tpad = 0.5 + # the padding on the bottom (in cm) + bpad = 0.5 + + # convert cm into pt + cm2pt = 100 + + # so the width of the image is going to be : + # lpad + namelength + hgap + (genomelength/cm2bp) + rpad + # the image will be 1 cm wide for each cm2bp genome locations + # mf is the mulriplication factor to convert number of alphabets into cms. + mf = 0.20 + imagewidth = lpad + (namelength*mf) + hgap + (genomelength/cm2bp) + rpad + + # the height of the image is going to be + if len(attributes) == 0: + imageheight = tpad + (len(names) * (eachplotheight + vgap)) + bpad + else: + imageheight = tpad + ((len(names)+3) * (eachplotheight + vgap)) + bpad + + # start the image + printheader(imagewidth, imageheight) + + if debug_flag == True: + printrectangle(0, 0, imagewidth * cm2pt, imageheight * cm2pt) + printrectangle(0, 0, lpad * cm2pt, imageheight * cm2pt) + printrectangle(lpad*cm2pt, 0, namelength*mf*cm2pt, imageheight*cm2pt) + printrectangle((lpad+(namelength*mf))*cm2pt, 0, hgap*cm2pt,imageheight*cm2pt) + printrectangle((lpad+(namelength*mf)+hgap)*cm2pt, 0, genomelength*cm2pt/cm2bp, imageheight*cm2pt) + + # set up persistent variables in the documents + docstart = lpad * cm2pt + figstart = (lpad * cm2pt) + (namelength * mf * cm2pt) + (hgap * cm2pt) + figlen = genomelength * cm2pt / cm2bp + + # print the details for all the individuals + y = tpad * cm2pt + for index,name in enumerate(names): + if debug_flag == True: + printrectangle(0, y, imagewidth * cm2pt, eachplotheight * cm2pt) + + printtext(docstart, y + (eachplotheight * 85), name) + printrectangle(figstart, y, figlen, eachplotheight * cm2pt) + + # print vertical lines for the variants + positions = variants[name][0] + for position in positions: + x = figstart + (position * cm2pt / cm2bp) + printline(x, y, x, y + (eachplotheight * 100), sw = varstrokewidth) + + # print colored lines for special variants + specialpositions = variants[name][1] + for type,position in specialpositions: + x = figstart + (position * cm2pt / cm2bp) + h = eachplotheight * 100 + if filltypes[type] == "C": + printline(x, y, x, y + h, sc = colors[type], sw = 4) + elif filltypes[type] == "L": + printline(x, y + h/2, x, y + h, sc = colors[type], sw = 4) + elif filltypes[type] == "U": + printline(x, y, x, y + h/2, sc = colors[type], sw = 4) + + else: + print >> stderr, "Incorrect fillsize type specified" + exit(7) + + # print translucent rectangles for the missing regions and indels + intervals = variants[name][2] + for type,s,e in intervals: + s = int(s) + e = int(e) + x = figstart + (s * cm2pt / cm2bp) + w = (e - s) * cm2pt / cm2bp + h = eachplotheight * 100 + if filltypes[type] == "C": + printrectangle(x, y, w, h, sw=1, so=0.1, fl=colors[type]) + elif filltypes[type] == "L": + printrectangle(x, y + h/2, w, h/2, sw=1, so=0.1,fl=colors[type]) + elif filltypes[type] == "U": + printrectangle(x, y, w, h/2, sw=1, so=0.1, fl=colors[type]) + else: + print >> stderr, "Incorrect fillsize type specified" + exit(8) + + y += ((eachplotheight + vgap) * cm2pt) + + # print the attributes if we have any + if len(attributes) > 0: + if debug_flag == True: + printrectangle(0, y, imagewidth * cm2pt, eachplotheight * cm2pt) + + printtext(docstart, y + (eachplotheight * 85), genomename) + printrectangle(figstart, y, figlen, eachplotheight * cm2pt) + + # lets color the attributes as specified by the user + for index,(start,stop,name,group) in enumerate(attributes): + start = int(start) + stop = int(stop) + + x = figstart + (start * cm2pt / cm2bp) + printrectangle(x, + y, + (stop - start) * cm2pt / cm2bp, + eachplotheight * cm2pt, + fl = colors.get(group, "White")) + + # can I fit the name of the gene/feature in the colored area? + wordlen = len(name) * mf + if (wordlen * cm2pt) < ((stop - start) * cm2pt / cm2bp): + if group in colors: + color = "White" + else: + color = "Black" + + printtext(x, + y + (eachplotheight * 85), + name, + fill = color, + dx = (((stop-start)*cm2pt/cm2bp)-(wordlen*cm2pt))/2) + else: + # will this name fit at all, even at the bottom? Where is the + # next text label that I need to write? + tmpidx = index + 1 + while tmpidx < len(attributes) and \ + len(attributes[tmpidx][2]) == 0: + tmpidx += 1 + + if tmpidx < len(attributes): + nextstart = int(attributes[tmpidx][0]) + if ((wordlen*cm2pt) < ((nextstart-start) * cm2pt/cm2bp)): + printtext(x, + y + (eachplotheight + vgap) * cm2pt, + name, + colors.get(group, "Black")) + + y += ((eachplotheight + vgap) * cm2pt) + + # print the coordinates on a line + y += vgap * cm2pt + if debug_flag == True: + printrectangle(0, y, imagewidth * cm2pt, eachplotheight * cm2pt) + + printline(figstart, y, figstart + figlen, y) + + x = figstart + ticlength = 15 + for i in range(0, genomelength, 2000): + printline(x, y, x, y + ticlength) + printtext(x, y + ticlength + vgap * cm2pt, str(i), fontweight="normal") + x += (2000 * cm2pt / cm2bp) + printline(figstart + figlen, y, figstart + figlen, y + ticlength) + + # print the legend if there were attributes + if len(attributes) > 0: + if debug_flag == True: + printrectangle(0, y, imagewidth * cm2pt, eachplotheight * cm2pt) + + y += ((eachplotheight + 2 * vgap) * cm2pt) + x = figstart + + for name,color in colors.items(): + printtext(x, y, name, fontsize = "0.9cm") + x += ((len(name) + 1) * mf * cm2pt) + printrectangle(x, + y - eachplotheight * cm2pt + 10, + 100, + eachplotheight * cm2pt * 3 / 4, + fl = color) + x += 125 + + # end of the image + printfooter() + +def usage(): + f = stderr + print >> f, "usage:" + print >> f, "varplot [options] specfile" + print >> f, "where the options are:" + print >> f, "-h,--help : print usage and quit" + print >> f, "-d,--debug: print debug information" + print >> f, "-w,--strokewidth: stroke width for normal variants [1]" + print >> f, "-b,--eachplotheight : height of the plot for an individual (in cm) [0.4]" + print >> f, "-g,--eachplotgap : vertical gap between plots of different individuals (in cm) [0.4]" + +if __name__ == "__main__": + try: + opts, args = getopt(argv[1:],"hdw:s:g:",["help","debug","strokewidth=", "eachplotheight=", "eachplotgap="]) + except GetoptError, err: + print str(err) + usage() + exit(2) + + # number of bases to be drawn in 1 cm + cm2bp = 1000 + + # the strokewidth used to show simple SNPs + varstrokewidth = 1 + + # the height of the plot for each individual (in cm) + eachplotheight = 0.4 + + # vertical gap between plots of different individuals (in cm) + vgap = 0.4 + + for o, a in opts: + if o in ("-h", "--help"): + usage() + exit() + elif o in ("-d", "--debug"): + debug_flag = True + elif o in ("-w", "--strokewidth"): + varstrokewidth = int(a) + elif o in ("-s", "--eachplotheight"): + eachplotheight = float(a) + elif o in ("-g", "--eachplotgap"): + vgap = float(a) + else: + assert False, "unhandled option" + + if len(args) != 1: + usage() + exit(3) + + main(args[0], cm2bp, eachplotheight, vgap)
--- a/genome_diversity/src/Fst_ave.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/Fst_ave.c Tue May 28 16:24:19 2013 -0400 @@ -56,7 +56,7 @@ #include "Fst_lib.h" // maximum length of a line from the table -#define MOST 5000 +#define MOST 50000 // information about the specified individuals // x is an array of nI values 0, 1, or 2; @@ -213,7 +213,7 @@ n = col[i]; if (genotypes) { k = X[n+2]; - if (k == -1 || X[n+3] < lower_bound) + if (k == -1) new->c[i].A = new->c[i].B = -1; else { new->c[i].A = k; @@ -237,8 +237,8 @@ printf("Average Reich-Patterson FST is %5.5f.\n", F2 = F_reich); printf("The population-based Reich-Patterson Fst is %5.5f.\n", F3 = N_reich/D_reich); + printf("Average Weir-Cockerham FST is %5.5f.\n", F1 = F_weir); printf("Average Wright FST is %5.5f.\n", F0 = F_wright); - printf("Average Weir-Cockerham FST is %5.5f.\n", F1 = F_weir); if (nfail > 0) printf("WARNING: %d of %d FSTs could not be computed\n", nfail, 3*nsnp);
--- a/genome_diversity/src/Fst_column.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/Fst_column.c Tue May 28 16:24:19 2013 -0400 @@ -51,7 +51,7 @@ #include "Fst_lib.h" // most characters allowed in a row of the table -#define MOST 5000 +#define MOST 50000 // column and population for the relevant individuals/groups int col[MOST], pop[MOST]; @@ -104,7 +104,11 @@ i < nI; ++i) { n = col[i]; g = X[n+2]; // save genotype - if ((genotypes && g == -1) || X[n+3] < min_qual) + + if (genotypes) { + if (g == -1) + continue; + } else if (X[n+3] < min_qual) continue; if (pop[i] == 1) { // column n (base 1) corresponds to entry X[n] @@ -131,7 +135,7 @@ } if (discard && ((A1 == 0 && A2 == 0) || (B1 == 0 && B2 == 0))) continue; // not variable in the two populations - if (x1+y1 < min_cov || x2+y2 < min_cov) + if (!genotypes && (x1+y1 < min_cov || x2+y2 < min_cov)) F = -1.0; else { if (unbiased == 0)
--- a/genome_diversity/src/Makefile Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/Makefile Tue May 28 16:24:19 2013 -0400 @@ -5,7 +5,7 @@ INSTALL_DIR = ../bin TARGETS = admix_prep aggregate coords2admix coverage dist_mat dpmix \ - eval2pct filter_snps Fst_ave Fst_column get_pi sweep + eval2pct filter_snps Fst_ave Fst_column get_pi mk_Ji mt_pi sweep all: $(TARGETS) @@ -46,6 +46,12 @@ get_pi: get_pi.c lib.c $(CC) $(CFLAGS) $^ -o $@ +mk_Ji: mk_Ji.c lib.c mito_lib.c + $(CC) $(CWARN) $^ -o $@ + +mt_pi: mt_pi.c lib.c mito_lib.c + $(CC) $(CWARN) $^ -o $@ + sweep: sweep.c lib.c Huang.c $(CC) $(CFLAGS) $^ -o $@
--- a/genome_diversity/src/admix_prep.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/admix_prep.c Tue May 28 16:24:19 2013 -0400 @@ -17,7 +17,7 @@ #include "lib.h" // bounds line length for a line of the Galaxy table -#define MOST 5000 +#define MOST 50000 struct individual { int column; char *name;
--- a/genome_diversity/src/aggregate.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/aggregate.c Tue May 28 16:24:19 2013 -0400 @@ -11,7 +11,7 @@ #include "lib.h" // most characters allowed in a row of the table -#define MOST 5000 +#define MOST 50000 // column for the relevant individuals/groups int col[MOST];
--- a/genome_diversity/src/coverage.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/coverage.c Tue May 28 16:24:19 2013 -0400 @@ -17,10 +17,10 @@ #include "lib.h" // maximum length of a line from the table -#define MOST 5000 +#define MOST 50000 // the largest coverage or quality value being considered -#define MAX_VAL 1000 +#define MAX_VAL 5000 FILE *gp; // for text output
--- a/genome_diversity/src/dist_mat.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/dist_mat.c Tue May 28 16:24:19 2013 -0400 @@ -21,7 +21,7 @@ // bounds line length for a line of the Galaxy table -#define MOST 5000 +#define MOST 50000 #define MIN_SNPS 3 struct argument { @@ -30,7 +30,7 @@ } A[MOST]; int nA; // number of individuals or groups + 1 (for the reference species) -#define MOST_INDIVIDUALS 100 +#define MOST_INDIVIDUALS 1000 #define SIZ 1+MOST_INDIVIDUALS // includes the reference double tot_diff[SIZ][SIZ]; @@ -48,7 +48,8 @@ fatal("args: Galaxy-table min-cov min-qual min-snp ref-name genotype dist-out mega-out 13:fred 16:mary ..."); min_coverage = atoi(argv[2]); min_quality = atoi(argv[3]); - if (min_coverage <= 0 && min_quality <= 0) + genotype = atoi(argv[5]); + if (!genotype && min_coverage <= 0 && min_quality <= 0) fatal("coverage and/or quality of SNPs should be constrained"); if (same_string(argv[4], "none")) @@ -57,7 +58,6 @@ has_ref = 1; A[0].name = copy_string(argv[4]); } - genotype = atoi(argv[5]); gp = ckopen(argv[6], "w"); mega = ckopen(argv[7], "w"); fprintf(mega, "#mega\n!Title: Galaxy;\n");
--- a/genome_diversity/src/dpmix.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/dpmix.c Tue May 28 16:24:19 2013 -0400 @@ -27,7 +27,7 @@ //#include <math.h> // maximum length of a line from the table -#define MOST 5000 +#define MOST 50000 // we create a linked list of "events" on a chromosome -- mostly SNPs, but // also ends of hetorochomatic intervals
--- a/genome_diversity/src/filter_snps.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/filter_snps.c Tue May 28 16:24:19 2013 -0400 @@ -16,7 +16,7 @@ #include "lib.h" // most characters allowed in a row of the table -#define MOST 5000 +#define MOST 50000 char buf[MOST];
--- a/genome_diversity/src/get_pi.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/get_pi.c Tue May 28 16:24:19 2013 -0400 @@ -24,7 +24,7 @@ // END_FILE should be larger than any real scaffold number #define END_FILE 1000000000 // scaffold "number" signifying end-of-file -#define BUF_SIZE 10000 // maximum length of a SNP-file row +#define BUF_SIZE 50000 // maximum length of a SNP-file row int col[10000], nC; // columns containing the chosen genotypes
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genome_diversity/src/mito_lib.c Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,98 @@ +// mito_data.c -- shared procedures to read SNP and coverage file for +// mitogenomes + +#include "lib.h" +#include "mito_lib.h" + +#define ADHOC + +// get the adequately covered intervals for each specified individual; +// merge adjacent intervals +void get_intervals(char *filename) { + FILE *fp = ckopen(filename, "r"); + char buf[500], name[100]; + struct interv *p, *new; + int i, b, e, cov; + + while (fgets(buf, 500, fp)) { + if (sscanf(buf, "%*s %d %d %d %s", &b, &e, &cov, name) != 4) + fatalf("interval: %s", buf); + if (cov < min_cov) + continue; + for (i = 0; i < nM && !same_string(M[i].name, name); ++i) + ; + if (i == nM) + continue; + if (M[i].last != NULL && M[i].last->e == b) { + // merge with adjacent interval + M[i].last->e = e; + continue; + } + new = ckalloc(sizeof(*new)); + new->b = b; + new->e = e; + new->next = NULL; + if ((p = M[i].last) == NULL) + M[i].intervals = new; + else + p->next = new; + M[i].last = new; + } + fclose(fp); +/* + for (i = 0; i < nM; ++i) { + printf("%s:", M[i].name); + for (p = M[i].intervals; p; p = p->next) + printf(" %d-%d", p->b, p->e); + putchar('\n'); + } +*/ +} + +// get the SNPs; for each SNP set the array of (first characters from) +// genotypes of the specified samples (individuals) +int get_variants(char *filename, struct snp *S, int refcol) { + FILE *fp = ckopen(filename, "r"); + char buf[5000], *s, *f[101], *z = " \t\n\0"; + int i, n, c; + + for (i = 0; i <= 100; ++i) + f[i] = NULL; + for (n = 0; fgets(buf, 500, fp); ++n) { + if (buf[0] == '#') { + --n; + continue; + } + if (n >= MAX_SNP) + fatal("too many SNPs"); + if (sscanf(buf, "%*s %d", &(S[n].pos)) != 1) + fatalf("pos : %s", buf); + S[n].g = ckalloc((nM+1)*(sizeof(char))); + S[n].g[nM] = '\0'; + for (i = 0; i <= 100; ++i) + if (f[i] != NULL) + free(f[i]); + for (i = 1, s = strtok(buf, z); s; s = strtok(NULL, z), ++i) + f[i] = copy_string(s); + for (i = 0; i < nM; ++i) { + // genotype is 2 columns past the individual's 1st + // column + // AD HOC RULE: IF THERE IS ONE READ OF EACH ALLELE, + // THEN IGNORE THE SNP. + c = M[i].col; + if (refcol == 0) + S[n].g[i] = f[c+2][0]; + else if (same_string(f[refcol+2], f[c+2])) + S[n].g[i] = '2'; + else + S[n].g[i] = '0'; +#ifdef ADHOC + if (same_string(f[c], "1") && + same_string(f[c+1], "1")) + S[n].g[i] = '-'; +#endif + } + } + fclose(fp); + return n; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genome_diversity/src/mito_lib.h Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,31 @@ +// mito_data.h -- header file for shared procedures to read SNPs and intervals +// for mitogenomes + +#define MAX_SNP 20000 +#define MAX_SAMPLE 100 +struct snp { + int pos; + char *g; // genotypes - one character per specified mitogenome +} S[MAX_SNP], I[MAX_SNP]; + +// intervals associated with each specified mitogenome +struct interv { + int b, e; + struct interv *next; +}; +int nM, min_cov, debug; + +// mitogenomes +struct mito { + char *name; + int col; // first column in the SNP table + struct interv *intervals, *last; +} M[MAX_SAMPLE]; + +// get the adequately covered intervals for each specified individual; +// merge adjacent intervals +void get_intervals(char *filename); + +// get the SNPs; for each SNP set the array of (first characters from) +// genotypes of the specified samples (individuals) +int get_variants(char *filename, struct snp *S, int refcol);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genome_diversity/src/mk_Ji.c Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,147 @@ +/* mk_Ji -- prepare data for drawing a picture of mitogenome data +* +* argv[1] -- SNP table for the mitogenome +* +* argv[2] -- indel table for the mitogenome +* +* argv[3] -- coverage table: file of intervals with lines like: + +P.ingens-mt 175 194 6 9-M-352 + +* giving genome name, start postion (base-0), end position (half open), +* coverage and sample name. +* +* argv[4] -- annotation file like + +P.ingens-mt 0 70 tRNA + tRNA-Phe +P.ingens-mt 70 1030 rRNA + 12S +P.ingens-mt 1030 1100 tRNA + tRNA-Val +P.ingens-mt 1101 2680 CDS + rRNA +P.ingens-mt 1101 2680 rRNA + 16S +P.ingens-mt 2680 2755 tRNA + tRNA-Leu +P.ingens-mt 2758 3713 CDS + ND1 +... +P.ingens-mt 15484 16910 D-loop + D-loop + +* argv[5] -- the minimum coverage. Intervals of lower coverage are ignored. +* +* argv[6] -- either the string "default" or the name of an individual +* +* argv[7], argv[8], ... column:name pairs like "9:sam". +* +* Also, if the last argument is "debug", then much output sent to stderr. +*/ + +#include "lib.h" +#include "mito_lib.h" + +int ref_len; + +// read gene annotation, change "CDS" to "gene", print for the drawing tool, +// print lines showing the genome name and length (last annotated position). +void get_genes(char *filename) { + FILE *fp = ckopen(filename, "r"); + char buf[500], ref[100], type[100], name[100], *t; + int b, e; + + while (fgets(buf, 500, fp)) { + if (sscanf(buf, "%s %d %d %s %*s %s", + ref, &b, &e, type, name) != 5) + fatalf("gag: %s", buf); + t = (same_string(type, "CDS") ? "gene" : type); + // print the Genome Annotation line + printf("@GA=%d:%d:%s:%s\n", b, e, name, t); + } + printf("@GL=%d\n", ref_len = e); + printf("@GN=%s\n", ref); +} + +// print items that are adequately covered +void visible(int i, struct snp *S, int nS, char *s) { + struct interv *t; + int j; + + for (j = 0, t = M[i].intervals; j < nS; ++j) { + while (t && t->e <= S[j].pos) + t = t->next; + if (t && t->b <= S[j].pos && S[j].g[i] == '0') + printf(" %s%d", s, S[j].pos); + } +} + +int main(int argc, char **argv) { + struct interv *t; + int i, nS, nI, last_e, refcol; + char *a, *s; + + if (argc > 7 && same_string(argv[argc-1], "debug")) { + --argc; + debug = 1; + } + + if (argc < 7) + fatal("args: snps indels intervals genes min_cov ref 9:sam 13:judy ... "); + + // store sample names and start positions (argv[6], argv[7], ...) + for (nM = 0, i = 7; i < argc; ++nM, ++i) { + if (nM >= MAX_SAMPLE) + fatalf("Too many mitogenomes"); + if ((s = strchr(a = argv[i], ':')) == NULL) + fatalf("colon: %s", a); + M[nM].col = atoi(a); + M[nM].name = copy_string(s+1); + } + if (same_string(argv[6], "default")) + refcol = 0; + else { + for (i = 0; i < nM && !same_string(argv[6], M[i].name); ++i) + ; + if (i == nM) + fatalf("improper reference name '%s'", argv[6]); + refcol = M[i].col; + // fprintf(stderr, "refcol = %d\n", refcol); + } + + // read annotation and annotate in the file for drawing + get_genes(argv[4]); + + // record color information + printf("@CL=rRNA:#EF8A62\n@CL=tRNA:#31A354\n@CL=gene:#B2182B\n"); + printf("@CL=missing:#67A9CF:L\n@CL=indel:#2166AC\n@CL=special:#000000\n"); + + min_cov = atoi(argv[5]); + + // store the coverage data + get_intervals(argv[3]); + + if (debug) { + for (i = 0; i < nM; ++i) { + fprintf(stderr, ">%s\n", M[i].name); + for (t = M[i].intervals; t; t = t->next) + fprintf(stderr, "%d\t%d\n", t->b, t->e); + } + putc('\n', stderr); + } + + // record the variants + nS = get_variants(argv[1], S, refcol); + nI = get_variants(argv[2], I, refcol); + + // report the information for each sample + for (i = 0; i < nM; ++i) { + printf("%s", M[i].name); + visible(i, S, nS, ""); + visible(i, I, nI, "indel="); + last_e = 0; + for (t = M[i].intervals; t; t = t->next) { + if (last_e < t->b) + printf(" missing=%d:%d", last_e, t->b); + last_e = t->e; + } + if (last_e < ref_len) + printf(" missing=%d:%d", last_e, ref_len); + putchar('\n'); + } + + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genome_diversity/src/mt_pi.c Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,164 @@ +/* mt_pi -- compute the diversity measure pi for mitochondrial genomes +* [SHOULD I OPTIONALLY INCLUDE INDELS?] +* +* argv[1] -- SNP table for the mitogenome +* +* argv[2] -- file of intervals with lines like: + +P.ingens-mt 175 194 6 9-M-352 + +* giving genome name, start postion (base-0), end position (half open), +* coverage and sample name. +* +* argv[3] -- the minimum coverage. Intervals of lower coverage are ignored. +* +* argv[4], argv[5], ... column:name pairs like "9:sam". +* +* Also, if the last argument is "debug", then much output sent to stderr, if it +* is "debug2", then the coverage and difference-count for each mitogenome-pair +* is sent to stderr. +*/ + +#include "lib.h" +#include "mito_lib.h" + +int debug2; + +// for a pair of samples, determine how much of the reference is in the +// adequately covered segments for both, and count the number of SNPs in those +// shared regions where they differ +// PUTATIVE HETEROPLASMIES ARE IGNORED +float pair(int i, int j, int nS) { + int covered, B, E, diffs, k; + struct interv *p = M[i].intervals, *q = M[j].intervals; + char x, y; + + // k scans the SNPs + covered = diffs = k = 0; + while (p && q) { + if (debug) + fprintf(stderr, "trying %d-%d and %d-%d\n", + p->b, p->e, q->b, q->e); + // take the intersection of the two well-covered intervals + B = MAX(p->b, q->b); + E = MIN(p->e, q->e); + if (B < E) { + if (debug) + fprintf(stderr, " covered %d\n", E-B); + covered += (E - B); + for ( ; k < nS && S[k].pos < E; ++k) { + if (S[k].pos >= B) { + x = S[k].g[i]; + y = S[k].g[j]; + if (debug) + fprintf(stderr, + " SNP %c %c at %d\n", + x, y, S[k].pos); +/* + if (x == '-' || y == '-') + fatalf("SNP at %d missing geno", + S[k].pos); +*/ +/* + if (x == '1' || y == '1') + continue; +*/ + if (x != y) { + ++diffs; + if (debug) + fprintf(stderr, + "\tdiff at %d\n", + S[k].pos); + } + } + } + } + // go to next adequately covered interval(s) + if (p->e < q->e) + p = p->next; + else if (p->e > q->e) + q = q->next; + else { + p = p->next; + q = q->next; + } + } + if (debug2) + fprintf(stderr, "cov(%s,%s) = %d, diffs = %d\n", + M[i].name, M[j].name, covered, diffs); +/* + if (covered == 0) + fatalf("coverage threshold is too high for %s and %s", + M/[i].name, M[j].name); +*/ + if (covered == 0) + return -1.0; + return (float)diffs/(float)covered; +} + +int main(int argc, char **argv) { + struct interv *t; + int i, j, nS, good_pairs, bad_pairs; + char *a, *s; + float tot, pr; + + if (argc > 4 && same_string(argv[argc-1], "debug")) { + --argc; + debug = debug2 = 1; + } else if (argc > 4 && same_string(argv[argc-1], "debug2")) { + --argc; + debug2 = 1; + } + + if (argc < 5) + fatal("args: snps intervals min_cov 9:sam 13:judy ... "); + // store sample names and start positions (argv[4], argv[5], ...) + for (nM = 0, i = 4; i < argc; ++nM, ++i) { + if (nM >= MAX_SAMPLE) + fatalf("Too many mitogenomes"); + if ((s = strchr(a = argv[i], ':')) == NULL) + fatalf("colon: %s", a); + M[nM].col = atoi(a); + M[nM].name = copy_string(s+1); + } + min_cov = atoi(argv[3]); + get_intervals(argv[2]); + + if (debug) { + for (i = 0; i < nM; ++i) { + fprintf(stderr, ">%s\n", M[i].name); + for (t = M[i].intervals; t; t = t->next) + fprintf(stderr, "%d\t%d\n", t->b, t->e); + } + putc('\n', stderr); + } + + // record the SNPs + nS = get_variants(argv[1], S, 0); + + if (debug) { + for (i = 0; i < nS; ++i) + fprintf(stderr, "%d %s\n", S[i].pos, S[i].g); + putc('\n', stderr); + } + + // record the total rate of diversity, over all pairs of individuals + // having overlapping well-covered intervals + good_pairs = bad_pairs = 0; + for (i = 0, tot = 0.0; i < nM; ++i) { + for (j = i+1; j < nM; ++j) { + pr = pair(i, j, nS); + if (pr >= 0.0) { + ++good_pairs; + tot += pr; + } else + ++bad_pairs; + } + } + printf("pi = %5.5f\n", tot/(float)good_pairs); + if (bad_pairs > 0) + printf("%d of %d pairs had no sequenced regions in common\n", + bad_pairs, bad_pairs + good_pairs); + + return 0; +}
--- a/genome_diversity/src/sweep.c Wed May 22 15:58:18 2013 -0400 +++ b/genome_diversity/src/sweep.c Tue May 28 16:24:19 2013 -0400 @@ -28,7 +28,7 @@ // maximum number of rows in any processed table #define MANY 20000000 -#define BUF_SIZE 5000 +#define BUF_SIZE 50000 #define MAX_WINDOW 1000000 double X[MANY]; // holds all scores
--- a/pca.py Wed May 22 15:58:18 2013 -0400 +++ b/pca.py Tue May 28 16:24:19 2013 -0400 @@ -7,6 +7,7 @@ import sys from BeautifulSoup import BeautifulSoup import gd_composite +import re ################################################################################ @@ -62,6 +63,8 @@ def make_ind_file(ind_file, input): pops = [] + name_map = [] + name_idx = 0 ofh = open(ind_file, 'w') @@ -78,11 +81,13 @@ individuals = entry.ol('li') for individual in individuals: individual_name = individual.string.encode('utf8').strip() - print >> ofh, individual_name, 'M', population_name + name_map.append(individual_name) + print >> ofh, 'ind_%s' % name_idx, 'M', population_name + name_idx += 1 i += 1 ofh.close() - return pops + return pops, name_map def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): with open(par_file, 'w') as fh: @@ -175,6 +180,26 @@ shutil.copy2('fake', coords_file) +ind_regex = re.compile('ind_([0-9]+)') + +def fix_names(name_map, files): + for file in files: + tmp_filename = '%s.tmp' % file + with open(tmp_filename, 'w') as ofh: + with open(file) as fh: + for line in fh: + line = line.rstrip('\r\n') + match = ind_regex.search(line) + if match: + idx = int(match.group(1)) + old = 'ind_%s' % idx + new = name_map[idx].replace(' ', '_') + line = line.replace(old, new) + print >> ofh, line + + shutil.copy2(tmp_filename, file) + os.unlink(tmp_filename) + ################################################################################ if len(sys.argv) != 5: @@ -194,7 +219,7 @@ do_map2snp(map_file, snp_file) ind_file = os.path.join(output_files_path, 'admix.ind') -population_names = make_ind_file(ind_file, input) +population_names, name_map = make_ind_file(ind_file, input) par_file = os.path.join(output_files_path, 'par.admix') evec_file = os.path.join(output_files_path, 'coordinates.txt') @@ -202,6 +227,7 @@ make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) smartpca_stats = do_smartpca(par_file) +fix_names(name_map, [ind_file, evec_file]) do_ploteig(evec_file, population_names) plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names))
--- a/phylogenetic_tree.py Wed May 22 15:58:18 2013 -0400 +++ b/phylogenetic_tree.py Tue May 28 16:24:19 2013 -0400 @@ -19,22 +19,102 @@ ################################################################################ -if len(sys.argv) < 11: - print >> sys.stderr, "Usage" +# <command interpreter="python"> +# phylogenetic_tree.py "$input" "$output" "$output.files_path" +# +# #if $input_type.choice == '0' +# "gd_snp" +# #if $input_type.data_source.choice == '0' +# "sequence_coverage" +# "$input_type.data_source.minimum_coverage" +# "$input_type.data_source.minimum_quality" +# #else if $input_type.data_source.choice == '1' +# "estimated_genotype" +# #else if $input_type.choice == '1' +# "gd_genotype" +# #end if +# +# #if $individuals.choice == '0' +# "all_individuals" +# #else if $individuals.choice == '1' +# "$individuals.p1_input" +# #end if +# +# #if ((str($input.metadata.scaffold) == str($input.metadata.ref)) and (str($input.metadata.pos) == str($input.metadata.rPos))) or (str($include_reference) == '0') +# "none" +# #else +# "$input.metadata.dbkey" +# #end if +# +# #set $draw_tree_options = ''.join(str(x) for x in [$branch_style, $scale_style, $length_style, $layout_style]) +# #if $draw_tree_options == '' +# "" +# #else +# "-$draw_tree_options" +# #end if +# +# #for $individual_name, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) +# #set $arg = '%s:%s' % ($individual_col, $individual_name) +# "$arg" +# #end for +# </command> + +################################################################################ + +# if len(sys.argv) < 11: +# print >> sys.stderr, "Usage" +# sys.exit(1) +# +# input, p1_input, output, extra_files_path, minimum_coverage, minimum_quality, dbkey, data_source, draw_tree_options = sys.argv[1:10] +# +# individual_metadata = sys.argv[10:] +# +# # note: TEST THIS +# if dbkey in ['', '?', 'None']: +# dbkey = 'none' +# +# p_total = Population() +# p_total.from_tag_list(individual_metadata) + +if len(sys.argv) < 5: + print >> sys.stderr, 'Usage' sys.exit(1) -input, p1_input, output, extra_files_path, minimum_coverage, minimum_quality, dbkey, data_source, draw_tree_options = sys.argv[1:10] +input, output, extra_files_path, input_type = sys.argv[1:5] +args = sys.argv[5:] + +data_source = '1' +minimum_coverage = '0' +minimum_quality = '0' -individual_metadata = sys.argv[10:] +if input_type == 'gd_snp': + data_source_arg = args.pop(0) + if data_source_arg == 'sequence_coverage': + data_source = '0' + minimum_coverage = args.pop(0) + minimum_quality = args.pop(0) + elif data_source_arg == 'estimated_genotype': + pass + else: + print >> sys.stderr, 'Unsupported data_source:', data_source_arg + sys.exit(1) +elif input_type == 'gd_genotype': + pass +else: + print >> sys.stderr, 'Unsupported input_type:', input_type + sys.exit(1) + +p1_input, dbkey, draw_tree_options = args[:3] # note: TEST THIS if dbkey in ['', '?', 'None']: dbkey = 'none' +individual_metadata = args[3:] + p_total = Population() p_total.from_tag_list(individual_metadata) - ################################################################################ mkdir_p(extra_files_path) @@ -88,6 +168,9 @@ tags = p1.tag_list() for tag in tags: + if input_type == 'gd_genotype': + column, name = tag.split(':') + tag = '{0}:{1}'.format(int(column) - 2, name) args.append(tag) fh = open(phylip_outfile, 'w')
--- a/phylogenetic_tree.xml Wed May 22 15:58:18 2013 -0400 +++ b/phylogenetic_tree.xml Tue May 28 16:24:19 2013 -0400 @@ -1,26 +1,41 @@ -<tool id="gd_phylogenetic_tree" name="Phylogenetic Tree" version="1.0.0"> +<tool id="gd_phylogenetic_tree" name="Phylogenetic Tree" version="1.1.0"> <description>: Show genetic relationships among individuals</description> <command interpreter="python"> - phylogenetic_tree.py "$input" + phylogenetic_tree.py "$input" "$output" "$output.files_path" + + #if $input_type.choice == '0' + "gd_snp" + #if $input_type.data_source.choice == '0' + "sequence_coverage" + "$input_type.data_source.minimum_coverage" + "$input_type.data_source.minimum_quality" + #else if $input_type.data_source.choice == '1' + "estimated_genotype" + #end if + #else if $input_type.choice == '1' + "gd_genotype" + #end if + #if $individuals.choice == '0' "all_individuals" #else if $individuals.choice == '1' - "$p1_input" + "$individuals.p1_input" #end if - "$output" "$output.files_path" "$minimum_coverage" "$minimum_quality" + #if ((str($input.metadata.scaffold) == str($input.metadata.ref)) and (str($input.metadata.pos) == str($input.metadata.rPos))) or (str($include_reference) == '0') "none" #else "$input.metadata.dbkey" #end if - "$data_source" + #set $draw_tree_options = ''.join(str(x) for x in [$branch_style, $scale_style, $length_style, $layout_style]) #if $draw_tree_options == '' "" #else "-$draw_tree_options" #end if + #for $individual_name, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) #set $arg = '%s:%s' % ($individual_col, $individual_name) "$arg" @@ -28,7 +43,31 @@ </command> <inputs> - <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + <conditional name="input_type"> + <param name="choice" type="select" format="integer" label="Input format"> + <option value="0" selected="true">gd_snp</option> + <option value="1">gd_genotype</option> + </param> + <when value="0"> + <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + + <conditional name="data_source"> + <param name="choice" type="select" format="integer" label="Distance metric"> + <option value="0">sequence coverage</option> + <option value="1" selected="true">estimated genotype</option> + </param> + <when value="0"> + <param name="minimum_coverage" type="integer" min="0" value="0" label="Minimum SNP coverage" /> + <param name="minimum_quality" type="integer" min="0" value="0" label="Minimum SNP quality" + help="Note: minimum coverage and minimum quality cannot both be 0" /> + </when> + <when value="1"/> + </conditional> + </when> + <when value="1"> + <param name="input" type="data" format="gd_genotype" label="Genotype dataset" /> + </when> + </conditional> <conditional name="individuals"> <param name="choice" type="select" label="Compute for"> @@ -41,21 +80,11 @@ </when> </conditional> - <param name="minimum_coverage" type="integer" min="0" value="0" label="Minimum SNP coverage" /> - - <param name="minimum_quality" type="integer" min="0" value="0" label="Minimum SNP quality" - help="Note: minimum coverage and minimum quality cannot both be 0" /> - <param name="include_reference" type="select" format="integer" label="Include reference sequence"> <option value="1" selected="true">Yes</option> <option value="0">No</option> </param> - <param name="data_source" type="select" format="integer" label="Distance metric"> - <option value="0" selected="true">sequence coverage</option> - <option value="1">estimated genotype</option> - </param> - <param name="branch_style" type="select" display="radio"> <label>Branch type</label> <option value="" selected="true">square</option> @@ -110,12 +139,13 @@ **Dataset formats** -The input dataset is in gd_snp_ format. +The input dataset is in gd_snp_ or gd_genotype_ format. The output is a composite dataset, containing the tree in both text (Newick_) and PostScript formats, as well as supplemental text information. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _Newick: http://evolution.genetics.washington.edu/phylip/newicktree.html .. _Dataset missing?: ./static/formatHelp.html
--- a/prepare_population_structure.py Wed May 22 15:58:18 2013 -0400 +++ b/prepare_population_structure.py Tue May 28 16:24:19 2013 -0400 @@ -53,12 +53,12 @@ ################################################################################ -if len(sys.argv) < 9: +if len(sys.argv) < 10: die("Usage") # parse command line -input_snp_filename, min_reads, min_qual, min_spacing, output_filename, output_files_path = sys.argv[1:7] -args = sys.argv[7:] +input_snp_filename, input_type, min_reads, min_qual, min_spacing, output_filename, output_files_path = sys.argv[1:8] +args = sys.argv[8:] individual_metadata = [] population_files = [] @@ -119,6 +119,9 @@ tags = p1.tag_list() for tag in tags: + if input_type == 'gd_genotype': + column, name = tag.split(':') + tag = '{0}:{1}'.format(int(column) - 2, name) args.append(tag) #print "args:", ' '.join(args)
--- a/prepare_population_structure.xml Wed May 22 15:58:18 2013 -0400 +++ b/prepare_population_structure.xml Tue May 28 16:24:19 2013 -0400 @@ -1,8 +1,14 @@ -<tool id="gd_prepare_population_structure" name="Prepare Input" version="1.0.0"> +<tool id="gd_prepare_population_structure" name="Prepare Input" version="1.1.0"> <description>: Filter and convert to the format needed for these tools</description> <command interpreter="python"> - prepare_population_structure.py "$input" "$min_reads" "$min_qual" "$min_spacing" "$output" "$output.files_path" + prepare_population_structure.py "$input" + #if $input_type.choice == '0' + "gd_snp" "$input_type.min_reads" "$input_type.min_qual" + #else if $input_type.choice == '1' + "gd_genotype" "0" "0" + #end if + "$min_spacing" "$output" "$output.files_path" #if $individuals.choice == '0' "all_individuals" #else if $individuals.choice == '1' @@ -18,7 +24,21 @@ </command> <inputs> - <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + <conditional name="input_type"> + <param name="choice" type="select" format="integer" label="Input format"> + <option value="0" selected="true">gd_snp</option> + <option value="1">gd_genotype</option> + </param> + <when value="0"> + <param name="input" type="data" format="gd_snp" label="SNP dataset" /> + <param name="min_reads" type="integer" min="0" value="0" label="Minimum SNP coverage" /> + <param name="min_qual" type="integer" min="0" value="0" label="Minimum SNP quality" /> + </when> + <when value="1"> + <param name="input" type="data" format="gd_genotype" label="Genotype dataset" /> + </when> + </conditional> + <conditional name="individuals"> <param name="choice" type="select" label="Individuals"> <option value="0" selected="true">All individuals</option> @@ -31,8 +51,7 @@ </repeat> </when> </conditional> - <param name="min_reads" type="integer" min="0" value="0" label="Minimum SNP coverage" /> - <param name="min_qual" type="integer" min="0" value="0" label="Minimum SNP quality" /> + <param name="min_spacing" type="integer" min="0" value="0" label="Minimum spacing between SNPs" /> </inputs> @@ -62,10 +81,11 @@ **Dataset formats** -The input datasets are in gd_snp_ and gd_indivs_ formats. +The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. The output dataset is in gd_ped_ format. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _gd_indivs: ./static/formatHelp.html#gd_indivs .. _gd_ped: ./static/formatHelp.html#gd_ped .. _Dataset missing?: ./static/formatHelp.html
--- a/rank_terms.py Wed May 22 15:58:18 2013 -0400 +++ b/rank_terms.py Tue May 28 16:24:19 2013 -0400 @@ -1,132 +1,132 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GOFisher.py -# -# Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, -# MA 02110-1301, USA. - -import argparse -import os -import sys -from fisher import pvalue -from decimal import Decimal,getcontext - -def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): - """ - """ - dGOTENSEMBLT={} - for eachl in open(inExtnddfile,'r'): - if eachl.strip(): - ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] - GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) - GOTs=GOTs.difference(set(['','U','N'])) - for GOT in GOTs: - try: - dGOTENSEMBLT[GOT].add(ENSEMBLT) - except: - dGOTENSEMBLT[GOT]=set([ENSEMBLT]) - #~ - ##dGOTENSEMBLT.pop('') - ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) - return dGOTENSEMBLT,ENSEMBLTGinGO - -def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): - """ - returns a set of the ENSEMBLT codes present in the input list and - in the GO file - """ - sENSEMBLTSAPsinGO=set() - for eachl in open(inSAPsfile,'r'): - ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] - if ENSEMBLT in ENSEMBLTGinGO: - sENSEMBLTSAPsinGO.add(ENSEMBLT) - return sENSEMBLTSAPsinGO - -def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): - """ - returns a list of the ENSEMBLT codes present in the input list and - in the GO file. The terms in this list are: 'Go Term','# Genes in - the GO Term','# Genes in the list and in the GO Term','Enrichement - of the GO Term for genes in the input list','Genes in the input list - present in the GO term' - """ - getcontext().prec=2#set 2 decimal places - SAPs_all=len(sENSEMBLTSAPsinGO) - NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all - #~ - lp=len(dGOTENSEMBLT) - cnt=0 - #~ - ltfreqs=[] - for echGOT in dGOTENSEMBLT: - cnt+=1 - ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) - CntGO_All=len(dGOTENSEMBLT[echGOT]) - SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) - NoSAPs_GO=CntGO_All-SAPs_GO - pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) - #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) - ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) - #~ - ltfreqs.sort() - ltfreqs.reverse() - outl=[] - cper,crank=Decimal('2'),0 - #~ - for perc,cnt_go,pval,goTerm in ltfreqs: - if perc<cper: - crank+=1 - cper=perc - outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm])) - #~ - return outl - - -def main(): - #~ - parser = argparse.ArgumentParser(description='Returns the count of genes in GO categories and their statistical overrrepresentation, from a list of genes and an extended file (i.e. plane text with ENSEMBLT and GO terms).') - parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) - parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) - parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) - parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) - parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) - parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) - - args = parser.parse_args() - - inSAPsfile = args.input - inExtnddfile = args.inExtnddfile - saleGOPCount = args.output - columnENSEMBLT = args.columnENSEMBLT - columnENSEMBLTExtndd = args.columnENSEMBLTExtndd - columnGOExtndd = args.columnGOExtndd - - #~ - dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) - sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) - outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO) - #~ - saleGOPCount=open(saleGOPCount,'w') - saleGOPCount.write('\n'.join(outl)) - saleGOPCount.close() - #~ - return 0 - -if __name__ == '__main__': - main() - +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GOFisher.py +# +# Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +import argparse +import os +import sys +from fisher import pvalue +from decimal import Decimal,getcontext + +def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): + """ + """ + dGOTENSEMBLT={} + for eachl in open(inExtnddfile,'r'): + if eachl.strip(): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] + GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) + GOTs=GOTs.difference(set(['','U','N'])) + for GOT in GOTs: + try: + dGOTENSEMBLT[GOT].add(ENSEMBLT) + except: + dGOTENSEMBLT[GOT]=set([ENSEMBLT]) + #~ + ##dGOTENSEMBLT.pop('') + ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) + return dGOTENSEMBLT,ENSEMBLTGinGO + +def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): + """ + returns a set of the ENSEMBLT codes present in the input list and + in the GO file + """ + sENSEMBLTSAPsinGO=set() + for eachl in open(inSAPsfile,'r'): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] + if ENSEMBLT in ENSEMBLTGinGO: + sENSEMBLTSAPsinGO.add(ENSEMBLT) + return sENSEMBLTSAPsinGO + +def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): + """ + returns a list of the ENSEMBLT codes present in the input list and + in the GO file. The terms in this list are: 'Go Term','# Genes in + the GO Term','# Genes in the list and in the GO Term','Enrichement + of the GO Term for genes in the input list','Genes in the input list + present in the GO term' + """ + getcontext().prec=2#set 2 decimal places + SAPs_all=len(sENSEMBLTSAPsinGO) + NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all + #~ + lp=len(dGOTENSEMBLT) + cnt=0 + #~ + ltfreqs=[] + for echGOT in dGOTENSEMBLT: + cnt+=1 + ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) + CntGO_All=len(dGOTENSEMBLT[echGOT]) + SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) + NoSAPs_GO=CntGO_All-SAPs_GO + pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) + #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) + ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) + #~ + ltfreqs.sort() + ltfreqs.reverse() + outl=[] + cper,crank=Decimal('2'),0 + #~ + for perc,cnt_go,pval,goTerm in ltfreqs: + if perc<cper: + crank+=1 + cper=perc + outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm])) + #~ + return outl + + +def main(): + #~ + parser = argparse.ArgumentParser(description='Returns the count of genes in GO categories and their statistical overrrepresentation, from a list of genes and an extended file (i.e. plane text with ENSEMBLT and GO terms).') + parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) + parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) + parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) + parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) + parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) + parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) + + args = parser.parse_args() + + inSAPsfile = args.input + inExtnddfile = args.inExtnddfile + saleGOPCount = args.output + columnENSEMBLT = args.columnENSEMBLT + columnENSEMBLTExtndd = args.columnENSEMBLTExtndd + columnGOExtndd = args.columnGOExtndd + + #~ + dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) + sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) + outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO) + #~ + saleGOPCount=open(saleGOPCount,'w') + saleGOPCount.write('\n'.join(outl)) + saleGOPCount.close() + #~ + return 0 + +if __name__ == '__main__': + main() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reorder.py Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +import sys + +def parse_rangelist(string): + rv = [] + + tokens = strip_split(string, ',') + for token in tokens: + int_list = parse_token(token) + for int_val in int_list: + int_val -= 1 + if int_val not in rv: + rv.append(int_val) + + return rv + +def parse_token(token): + values = strip_split(token, '-') + num_values = len(values) + + if num_values not in [1, 2]: + print >> sys.stderr, 'Error: "%s" is not a valid range' % token + sys.exit(1) + + int_list = [] + for value in values: + if value: + int_val = as_int(value) + + if int_val < 1: + print >> sys.stderr, 'Error: "%s" is not >= 1' % value + sys.exit(1) + + int_list.append(int_val) + else: + print >> sys.stderr, 'Error: "%s" is not a valid range' % token + sys.exit(1) + + if num_values == 1: + return int_list + + a, b = int_list + + if a <= b: + return range(a, b+1) + else: + return range(a, b-1, -1) + +def strip_split(string, delim): + return [elem.strip() for elem in string.split(delim)] + +def as_int(string): + try: + val = int(string) + except: + print >> sys.stderr, 'Error: "%s" does not appear to be an integer' % string + sys.exit(1) + return val + +def get_lines(filename): + rv = [] + + fh = open(filename) + for line in fh: + line = line.rstrip('\r\n') + rv.append(line) + fh.close() + + return rv + +def reorder(old_lines, new_order, filename): + max_index = len(old_lines) - 1 + + fh = open(filename, 'w') + + for index in new_order: + if index <= max_index: + print >> fh, old_lines[index] + old_lines[index] = None + + for line in old_lines: + if line is not None: + print >> fh, line + + fh.close() + +if len(sys.argv) != 4: + print >> sys.stderr, "Usage" + sys.exit(1) + +input, output, order_string = sys.argv[1:] + +new_order = parse_rangelist(order_string) +old_lines = get_lines(input) +reorder(old_lines, new_order, output) + +sys.exit(0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reorder.xml Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,19 @@ +<tool id="gd_reorder" name="Reorder" version="1.0.0"> + <description>individuals</description> + + <command interpreter="python"> + reorder.py "$input" "$output" "$order" + </command> + + <inputs> + <param name="input" type="data" format="gd_indivs" label="Individuals dataset" /> + <param name="order" size="40" type="text" value="" label="New order"/> + </inputs> + + <outputs> + <data name="output" format="gd_indivs" metadata_source="input"/> + </outputs> + + <help> + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/specify.py Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,147 @@ +#!/usr/bin/env python + +import sys +import base64 + +def parse_args(args): + if len(args) < 3: + usage() + + input_file, output_file = args[1:3] + + individuals = [] + checkboxes = [] + strings = [] + + for arg in args[3:]: + if ':' in arg: + arg_type, arg = arg.split(':', 1) + else: + print >> sys.stderr, "unknown argument:", arg + usage() + + if arg_type == 'individual': + individuals.append(arg) + elif arg_type == 'checkbox': + checkboxes.append(arg) + elif arg_type == 'string': + strings.append(arg) + else: + print >> sys.stderr, "unknown argument:", arg + usage() + + return input_file, output_file, individuals, checkboxes, strings + +def usage(): + print >> sys.stderr, "Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0]) + sys.exit(1) + +def parse_individuals(individuals): + ind_col2name = {} + ind_name2col = {} + + for individual in individuals: + if ':' in individual: + column, name = individual.split(':', 1) + else: + print >> sys.stderr, "invalid individual specification:", individual + usage() + + try: + column = int(column) + except: + print "individual column is not an integer:", individual + usage() + + if column not in ind_col2name: + ind_col2name[column] = name + else: + if ind_col2name[column] != name: + print "duplicate individual column:", name, column, ind_col2name[column] + usage() + + if name not in ind_name2col: + ind_name2col[name] = [column] + elif column not in ind_name2col[name]: + ind_name2col[name].append(column) + + return ind_col2name, ind_name2col + +def parse_checkboxes(checkboxes, ind_col2name): + columns = [] + + for checkbox in checkboxes: + if ':' in checkbox: + column, name = checkbox.split(':', 1) + else: + print >> sys.stderr, "invalid checkbox specification:", checkbox + usage() + + try: + column = int(column) + except: + print "checkbox column is not an integer:", checkbox + usage() + + if column not in ind_col2name: + print "individual not in SNP table:", name + usage() + + if column not in columns: + columns.append(column) + + return columns + +def parse_strings(strings, ind_col2name, ind_name2col): + columns = [] + + for string in strings: + try: + decoded = base64.b64decode(string) + except: + print >> sys.stderr, "invalid base64 string:", string + usage() + + names = find_names(decoded, ind_name2col.keys()) + for name in names: + cols = ind_name2col[name] + if len(cols) == 1: + col = cols[0] + if col not in columns: + columns.append(col) + else: + print >> sys.stderr, "name with multiple columns:", name + usage() + + return columns + +def find_names(string, names): + rv = [] + for name in names: + if name in string: + if name not in rv: + rv.append(name) + return rv + + + + +input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv) +ind_col2name, ind_name2col = parse_individuals(individuals) +cb_cols = parse_checkboxes(checkboxes, ind_col2name) +str_cols = parse_strings(strings, ind_col2name, ind_name2col) + +out_cols = cb_cols +for col in str_cols: + if col not in out_cols: + out_cols.append(col) + +with open(output_file, 'w') as fh: + for col in sorted(out_cols): + print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']]) + +sys.exit(0) + + + +
--- a/specify.xml Wed May 22 15:58:18 2013 -0400 +++ b/specify.xml Tue May 28 16:24:19 2013 -0400 @@ -1,28 +1,42 @@ -<tool id="gd_specify" name="Specify Individuals" version="1.0.0"> +<tool id="gd_specify" name="Specify Individuals" version="1.1.0"> <description>: Define a collection of individuals from a gd_snp dataset</description> - <command interpreter="bash"> - echo.bash "$input" "$output" - #for $individual in str($individuals).split(',') - #set $individual_idx = $input.dataset.metadata.individual_names.index($individual) - #set $individual_col = str( $input.dataset.metadata.individual_columns[$individual_idx] ) - #set $arg = '\t'.join([$individual_col, $individual, '']) - "$arg" + <command interpreter="python"> + specify.py "$input" "$output" + #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) + #set $individual_arg = 'individual:%s:%s' % ($individual_col, $individual) + "$individual_arg" #end for + #if str($individuals).strip() != 'None' + #for $individual in str($individuals).split(',') + #set $individual_idx = $input.dataset.metadata.individual_names.index($individual) + #set $individual_col = str( $input.dataset.metadata.individual_columns[$individual_idx] ) + #set $cb_arg = 'checkbox:%s:%s' % ($individual_col, $individual) + "$cb_arg" + #end for + #end if + #if str($string).strip() != '' + #set str_arg = 'string:%s' % ( __import__('base64').b64encode( str($string) ) ) + "$str_arg" + #end if </command> <inputs> - <param name="input" type="data" format="gd_snp" label="SNP dataset"/> + <param name="input" type="data" format="gd_snp,gd_genotype" label="SNP or Genotype dataset"/> <param name="individuals" type="select" display="checkboxes" multiple="true" label="Individuals to include"> <options> <filter type="data_meta" ref="input" key="individual_names" /> </options> - <validator type="no_options" message="You must select at least one individual."/> </param> <param name="outname" type="text" size="20" label="Label for this collection"> <validator type="empty_field" message="You must enter a label."/> #used to be "Individuals from ${input.hid}" </param> + <param name="string" type="text" size="40" label="Individuals to include"> + <sanitizer> + <valid initial="string.printable"/> + </sanitizer> + </param> </inputs> <outputs> @@ -41,10 +55,11 @@ **Dataset formats** -The input dataset is in gd_snp_ format; +The input dataset is in gd_snp_ or gd_genotype_ format; the output is in gd_indivs_ format. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _gd_indivs: ./static/formatHelp.html#gd_indivs .. _Dataset missing?: ./static/formatHelp.html @@ -52,12 +67,17 @@ **What it does** -This tool makes a list of selected entities (the sets of four columns -representing individuals or groups) from a gd_snp dataset. It does not copy -the SNP data; it just records which entities should be considered as belonging -to some collection or population. The label you specify is used to name the -output dataset in your history. This list can then be used to instruct other -tools to work on just part of the original gd_snp dataset. +This tool makes a list of selected entities, i.e., the sets of four +columns representing individuals or groups from a gd_snp dataset, or +sets of single columns in a gd_genotype file. It does not copy the +data; it just records which entities should be considered as belonging +to some collection or population. The label you specify is used to +name the output dataset in your history. This list can then be used +to instruct other tools to work on just part of the original gd_snp or +gd_genotype dataset. The entities can be specified with the checklist +and/or by pasting their names (possibly with extraneous characters, as +in a portion of the Newick-format output of the Phylogenetic Tree tool) +into the box provided at the bottom of the page. -----