0
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3 suppressPackageStartupMessages(library("adegenet"))
|
|
4 suppressPackageStartupMessages(library("ape"))
|
|
5 suppressPackageStartupMessages(library("data.table"))
|
|
6 suppressPackageStartupMessages(library("dbplyr"))
|
|
7 suppressPackageStartupMessages(library("dplyr"))
|
|
8 suppressPackageStartupMessages(library("ggplot2"))
|
|
9 suppressPackageStartupMessages(library("knitr"))
|
|
10 suppressPackageStartupMessages(library("maps"))
|
|
11 suppressPackageStartupMessages(library("mapproj"))
|
|
12 suppressPackageStartupMessages(library("optparse"))
|
|
13 suppressPackageStartupMessages(library("poppr"))
|
|
14 suppressPackageStartupMessages(library("RColorBrewer"))
|
|
15 suppressPackageStartupMessages(library("RPostgres"))
|
|
16 suppressPackageStartupMessages(library("SNPRelate"))
|
|
17 suppressPackageStartupMessages(library("tidyr"))
|
|
18 suppressPackageStartupMessages(library("vcfR"))
|
|
19 suppressPackageStartupMessages(library("vegan"))
|
|
20 suppressPackageStartupMessages(library("yarrr"))
|
|
21 theme_set(theme_bw())
|
|
22
|
|
23 DEFAULT_MISSING_NUMERIC_VALUE <- -9.000000;
|
|
24
|
|
25 option_list <- list(
|
|
26 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
|
|
27 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
|
|
28 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
|
|
29 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
|
|
30 make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"),
|
|
31 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file")
|
|
32 )
|
|
33
|
|
34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
|
|
35 args <- parse_args(parser, positional_arguments=TRUE);
|
|
36 opt <- args$options;
|
|
37
|
|
38 get_file_path = function(dir, file_name) {
|
|
39 file_path = paste(dir, file_name, sep="/");
|
|
40 return(file_path);
|
|
41 }
|
|
42
|
|
43 get_database_connection <- function(db_conn_string) {
|
|
44 # Instantiate database connection.
|
|
45 # The connection string has this format:
|
|
46 # postgresql://user:password@host/dbname
|
|
47 conn_items <- strsplit(db_conn_string, "://")[[1]];
|
|
48 string_needed <- conn_items[2];
|
|
49 items_needed <- strsplit(string_needed, "@")[[1]];
|
|
50 user_pass_string <- items_needed[1];
|
|
51 host_dbname_string <- items_needed[2];
|
|
52 user_pass_items <- strsplit(user_pass_string, ":")[[1]];
|
|
53 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]];
|
|
54 user <- user_pass_items[1];
|
|
55 pass <- user_pass_items[2];
|
|
56 host <- host_dbname_items[1];
|
|
57 dbname <- host_dbname_items[2];
|
|
58 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass);
|
|
59 return (conn);
|
|
60 }
|
|
61
|
4
|
62 log_data_frame <- function(name, df) {
|
|
63 cat("\n", name, ":\n");
|
|
64 show(df);
|
|
65 cat("\n\n");
|
|
66 }
|
|
67
|
0
|
68 time_elapsed <- function(start_time) {
|
|
69 cat("Elapsed time: ", proc.time() - start_time, "\n\n");
|
|
70 }
|
|
71
|
|
72 time_start <- function(msg) {
|
|
73 start_time <- proc.time();
|
|
74 cat(msg, "...\n");
|
|
75 return(start_time);
|
|
76 }
|
|
77
|
|
78 write_data_frame <- function(dir, file_name, data_frame) {
|
|
79 cat("\nWriting file: ", file_name, "\n");
|
|
80 file_path <- get_file_path(dir, file_name);
|
|
81 write.table(data_frame, file=file_path, quote=FALSE, row.names=FALSE, sep="\t");
|
|
82 }
|
|
83
|
|
84 # Prepare for processing.
|
|
85 output_data_dir = "output_data_dir";
|
|
86 output_plots_dir = "output_plots_dir";
|
|
87 # Read in VCF input file.
|
|
88 start_time <- time_start("Reading VCF input");
|
|
89 vcf <- read.vcfR(opt$input_vcf);
|
|
90 time_elapsed(start_time);
|
|
91
|
|
92 # Convert VCF file into a genind for the Poppr package.
|
|
93 start_time <- time_start("Converting VCF data to a genind object");
|
|
94 genind_obj <- vcfR2genind(vcf);
|
4
|
95 cat("\ngenind_obj:\n");
|
|
96 genind_obj
|
|
97 cat("\n\n");
|
0
|
98 time_elapsed(start_time);
|
|
99
|
|
100 # Add population information to the genind object.
|
|
101 population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t", quote="");
|
|
102 colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region");
|
4
|
103 cat("\npopulation_info_data_table:\n");
|
|
104 population_info_data_table
|
|
105 cat("\n\n");
|
0
|
106 #write_data_frame(output_data_dir, "population_info_data_table", population_info_data_table);
|
|
107 genind_obj@pop <- as.factor(population_info_data_table$region);
|
|
108 strata(genind_obj) <- data.frame(pop(genind_obj));
|
|
109
|
|
110 # Convert genind object to a genclone object.
|
|
111 start_time <- time_start("Converting the genind object to a genclone object");
|
|
112 genind_clone <- as.genclone(genind_obj);
|
4
|
113 cat("\ngenind_clone:\n");
|
|
114 genind_clone
|
|
115 cat("\n\n");
|
0
|
116 time_elapsed(start_time);
|
|
117
|
|
118 # Calculate the bitwise distance between individuals.
|
|
119 start_time <- time_start("Calculating the bitwise distance between individuals");
|
|
120 bitwise_distance <- bitwise.dist(genind_clone);
|
|
121 time_elapsed(start_time);
|
|
122
|
|
123 # Multilocus genotypes (threshold of 3.2%).
|
4
|
124 cat("\nFiltering multilocus genotypes with threshold of 3.2%...\n\n");
|
0
|
125 mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032;
|
|
126
|
|
127 # Create list of MLGs.
|
4
|
128 cat("\nCreating list of mlg_ids...\n\n");
|
0
|
129 mlg_ids <- mlg.id(genind_clone);
|
|
130
|
|
131 # Read user's Affymetrix 96 well plate tabular file.
|
|
132 affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings=c("", "NA"), quote="");
|
|
133 colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
|
|
134 "region", "latitude", "longitude", "geographic_origin", "colony_location",
|
|
135 "depth", "disease_resist", "bleach_resist", "mortality","tle",
|
|
136 "spawning", "collector_last_name", "collector_first_name", "organization", "collection_date",
|
|
137 "email", "seq_facility", "array_version", "public", "public_after_date",
|
|
138 "sperm_motility", "healing_time", "dna_extraction_method", "dna_concentration", "registry_id",
|
|
139 "result_folder_name", "plate_barcode");
|
|
140 affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id);
|
4
|
141 log_data_frame("affy_metadata_data_frame", affy_metadata_data_frame);
|
0
|
142 user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id);
|
4
|
143 cat("\nuser_specimen_ids:\n", toString(user_specimen_ids), "\n\n");
|
0
|
144 # The specimen_id_field_call_data_table looks like this:
|
|
145 # user_specimen_ids V2
|
|
146 # 1090 prolifera
|
|
147 # 1091 prolifera
|
4
|
148 cat("\nCreating specimen_id_field_call_data_table...\n");
|
0
|
149 specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call);
|
|
150 # Rename the user_specimen_ids column.
|
|
151 setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id"));
|
|
152 # Rename the V2 column.
|
|
153 setnames(specimen_id_field_call_data_table, c("V2"), c("field_call"));
|
|
154
|
|
155 # Connect to database.
|
|
156 conn <- get_database_connection(opt$database_connection_string);
|
|
157 # Import the sample table.
|
|
158 sample_table <- tbl(conn, "sample");
|
|
159 # Import the genotype table.
|
|
160 genotype_table <- tbl(conn, "genotype");
|
2
|
161 # Import the probe_annotation table.
|
|
162 probe_annotation_table <- tbl(conn, "probe_annotation");
|
0
|
163 # Select columns from the sample table and the
|
|
164 # genotype table joined by genotype_id.
|
|
165 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, bcoral_genet_id, genotype_id);
|
|
166 smlg <- sample_table_columns %>%
|
|
167 left_join(genotype_table %>%
|
|
168 select("id", "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"),
|
|
169 by=c("genotype_id"="id"));
|
|
170 # Name the columns.
|
|
171 smlg_data_frame <- as.data.frame(smlg);
|
|
172 colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "bcoral_genet_id", "genotype_id",
|
2
|
173 "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call");
|
4
|
174 log_data_frame("smlg_data_frame", smlg_data_frame);
|
0
|
175 # Missing GT in samples submitted.
|
|
176 start_time <- time_start("Discovering missing GT in samples");
|
|
177 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
|
|
178 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
|
|
179 missing_gt <- (missing_gt / nrow(vcf)) * 100;
|
|
180 missing_gt_data_frame <- data.frame(missing_gt);
|
4
|
181 log_data_frame("missing_gt_data_frame", missing_gt_data_frame);
|
0
|
182 # The specimen_id_field_call_data_table looks like this:
|
|
183 # rn missing_gt
|
|
184 # a100000-4368120-060520-256_I07.CEL 0.06092608
|
|
185 # a100000-4368120-060520-256_K07.CEL 0.05077173
|
4
|
186 cat("\nCreating missing_gt_data_table...\n");
|
|
187 missing_gt_data_table <- setDT(missing_gt_data_frame, keep.rownames=TRUE)[];
|
0
|
188 # Rename the rn column.
|
|
189 setnames(missing_gt_data_table, c("rn"), c("affy_id"));
|
|
190 # Rename the missing_gt column.
|
|
191 setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral"));
|
|
192 # Round data to two digits.
|
|
193 missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2);
|
|
194 time_elapsed(start_time);
|
|
195
|
2
|
196 # Subset genotypes for the fixed SNPs by probe id.
|
|
197 # Select columns from the probe_annotation table.
|
|
198 probe_annotation_table_columns <- probe_annotation_table %>% select(probe_set_id, custid, fixed_status, acerv_allele);
|
|
199 # Convert to data frame.
|
|
200 fixed_snp_data_frame <- as.data.frame(probe_annotation_table_columns);
|
|
201 # Name the columns.
|
|
202 colnames(fixed_snp_data_frame) <- c("probe_set_id", "custid", "fixed_status", "acerv_allele");
|
|
203 # Filter unwanted rows.
|
|
204 fixed_snp_data_frame <- subset(fixed_snp_data_frame, fixed_snp_data_frame$fixed_status=="keep");
|
4
|
205 log_data_frame("fixed_snp_data_frame", fixed_snp_data_frame);
|
2
|
206 gt_fixed <- gt[rownames(gt) %in% fixed_snp_data_frame$probe_set_id, ];
|
4
|
207 log_data_frame("gt_fixed", gt_fixed);
|
2
|
208
|
|
209 # Missing GT in fixed SNPs.
|
|
210 missing_gt_fixed <- apply(gt_fixed, MARGIN=2, function(x){ sum(is.na(x))});
|
|
211 missing_gt_fixed <- (missing_gt_fixed / nrow(gt_fixed)) * 100;
|
|
212 missing_gt_fixed_data_frame <- data.frame(missing_gt_fixed);
|
4
|
213 log_data_frame("missing_gt_fixed_data_frame", missing_gt_fixed_data_frame);
|
|
214 cat("\nCreating missing_gt_fixed_data_table...\n");
|
2
|
215 missing_gt_fixed_data_table <- setDT(missing_gt_fixed_data_frame, keep.rownames=TRUE)[];
|
|
216 # Rename the rn column.
|
|
217 setnames(missing_gt_fixed_data_table, c("rn"), c("affy_id"));
|
|
218 # Rename the missing_gt column.
|
|
219 setnames(missing_gt_fixed_data_table, c("missing_gt_fixed"), c("percent_missing_data_fixed"));
|
|
220 # Round data to two digits.
|
|
221 missing_gt_fixed_data_table$percent_missing_data_fixed <- round(missing_gt_fixed_data_table$percent_missing_data_fixed, digits=2);
|
|
222
|
|
223 # Heterozygous alleles for fixed SNPs.
|
0
|
224 start_time <- time_start("Discovering heterozygous alleles");
|
2
|
225 heterozygous_alleles <- apply(gt_fixed, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))});
|
|
226 heterozygous_alleles <- (heterozygous_alleles / nrow(gt_fixed)) * 100;
|
0
|
227 heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles);
|
4
|
228 log_data_frame("heterozygous_alleles_data_frame", heterozygous_alleles_data_frame);
|
0
|
229 # The heterozygous_alleles_data_table looks like this:
|
|
230 # rn heterozygous_alleles
|
|
231 # a100000-4368120-060520-256_I07.CEL 73.94903
|
|
232 # a100000-4368120-060520-256_K07.CEL 74.40089
|
4
|
233 cat("\nCreating heterozygous_alleles_data_table...\n");
|
0
|
234 heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[];
|
|
235 # Rename the rn column.
|
|
236 setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id"));
|
|
237 # Rename the heterozygous_alleles column.
|
|
238 setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral"));
|
|
239 # Round data to two digits.
|
|
240 heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2);
|
|
241 time_elapsed(start_time);
|
|
242
|
2
|
243 # Create list of Acerv reference and alternative probes.
|
|
244 rAC <- subset(fixed_snp_data_frame, fixed_snp_data_frame$acerv_allele=="reference");
|
|
245 aAC <- subset(fixed_snp_data_frame, fixed_snp_data_frame$acerv_allele=="alternative");
|
|
246
|
|
247 # Subset probes for the reference and alternative SNPs in Acerv.
|
|
248 ref_ac <- gt_fixed[rownames(gt_fixed) %in% rAC$probe,];
|
|
249 alt_ac <- gt_fixed[rownames(gt_fixed) %in% aAC$probe,];
|
|
250
|
|
251 # Reference alleles for each species.
|
|
252 reference_alleles_ac <- apply(ref_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))});
|
|
253 reference_alleles_ap <- apply(alt_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))});
|
|
254
|
|
255 # Alternative alleles for each species.
|
|
256 alternative_alleles_ac <- apply(alt_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
|
|
257 alternative_alleles_ap <- apply(ref_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
|
|
258
|
|
259 # Apalm alleles.
|
0
|
260 start_time <- time_start("Discovering reference alleles");
|
2
|
261 ap_sum <- rowSums(cbind(reference_alleles_ap,alternative_alleles_ap));
|
|
262 ap_alleles <- (ap_sum / nrow(gt_fixed)) * 100;
|
|
263 ap_alleles_data_frame <- data.frame(ap_alleles);
|
4
|
264 log_data_frame("ap_alleles_data_frame", ap_alleles_data_frame);
|
0
|
265 # The reference_alleles_data_table looks like this:
|
|
266 # rn reference_alleles
|
|
267 # a100000-4368120-060520-256_I07.CEL 11.60642
|
|
268 # a100000-4368120-060520-256_K07.CEL 11.45918
|
4
|
269 cat("\nCreating ap_alleles_data_table...\n");
|
2
|
270 ap_alleles_data_table <- setDT(ap_alleles_data_frame, keep.rownames=TRUE)[];
|
0
|
271 # Rename the rn column.
|
2
|
272 setnames(ap_alleles_data_table, c("rn"), c("affy_id"));
|
0
|
273 # Rename the reference_alleles column.
|
2
|
274 setnames(ap_alleles_data_table, c("ap_alleles"), c("percent_apalm_coral"));
|
0
|
275 # Round data to two digits.
|
2
|
276 ap_alleles_data_table$percent_apalm_coral <- round(ap_alleles_data_table$percent_apalm_coral, digits=2);
|
0
|
277 time_elapsed(start_time);
|
|
278
|
2
|
279 # Acerv alleles.
|
0
|
280 start_time <- time_start("Discovering alternative alleles");
|
2
|
281 ac_sum <- rowSums(cbind(reference_alleles_ac,alternative_alleles_ac));
|
|
282 ac_alleles <- (ac_sum / nrow(gt_fixed)) * 100;
|
|
283 ac_alleles_data_frame <- data.frame(ac_alleles);
|
4
|
284 log_data_frame("ac_alleles_data_frame", ac_alleles_data_frame);
|
0
|
285 # The alternative_alleles_data_table looks like this:
|
|
286 # rn alternative_alleles
|
|
287 # a100000-4368120-060520-256_I07.CEL 14.38363
|
|
288 # a100000-4368120-060520-256_K07.CEL 14.08916
|
4
|
289 cat("\nCreating ac_alleles_data_table...\n");
|
2
|
290 ac_alleles_data_table <- setDT(ac_alleles_data_frame, keep.rownames=TRUE)[];
|
0
|
291 # Rename the rn column.
|
2
|
292 setnames(ac_alleles_data_table, c("rn"), c("affy_id"));
|
0
|
293 # Rename the alternative_alleles column.
|
2
|
294 setnames(ac_alleles_data_table, c("ac_alleles"), c("percent_acerv_coral"));
|
0
|
295 # Round data to two digits.
|
2
|
296 ac_alleles_data_table$percent_acerv_coral <- round(ac_alleles_data_table$percent_acerv_coral, digits=2);
|
0
|
297 time_elapsed(start_time);
|
|
298
|
|
299 # The mlg_ids_data_table looks like this:
|
|
300 # mlg_ids
|
|
301 # a550962-4368120-060520-500_M23.CEL
|
|
302 # a550962-4368120-060520-256_A19.CEL
|
4
|
303 cat("\nCreating mlg_ids_data_table...\n");
|
0
|
304 mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE);
|
|
305 # Rename the mlg_ids column.
|
|
306 setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id"));
|
|
307
|
|
308 # sample_mlg_tibble looks like this:
|
|
309 # A tibble: 262 x 3
|
|
310 # Groups: group [?]
|
|
311 # group affy_id coral_mlg_clonal_id coral_mlg_rep_sample_id
|
|
312 # <int> <chr> <chr> <chr>
|
|
313 # 1 a550962-4368.CEL NA 13905
|
|
314 sample_mlg_tibble <- mlg_ids_data_table %>%
|
|
315 group_by(row_number()) %>%
|
|
316 dplyr::rename(group="row_number()") %>%
|
|
317 unnest (affy_id) %>%
|
|
318 # Join with mlg table.
|
|
319 left_join(smlg_data_frame %>%
|
|
320 select("affy_id","coral_mlg_clonal_id", "coral_mlg_rep_sample_id"),
|
|
321 by="affy_id");
|
|
322
|
|
323 # If found in database, group members on previous mlg id.
|
|
324 uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]);
|
|
325 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
|
|
326 na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id));
|
|
327 na.group <- sample_mlg_tibble$group[na.mlg];
|
|
328 sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
|
|
329
|
|
330 # Find out if the sample mlg matched a previous genotyped sample.
|
|
331 # sample_mlg_match_tibble looks like this:
|
|
332 # A tibble: 262 x 4
|
|
333 # Groups: group [230]
|
|
334 # group affy_id coral_mlg_clonal_id db_match
|
|
335 # <int> <chr> <chr> <chr>
|
|
336 # 1 a550962-436.CEL NA no_match
|
|
337 sample_mlg_match_tibble <- sample_mlg_tibble %>%
|
|
338 group_by(group) %>%
|
|
339 mutate(db_match = ifelse(is.na(coral_mlg_clonal_id), "no_match", "match"));
|
|
340
|
|
341 # Create new mlg id for samples with no matches in the database.
|
|
342 none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]);
|
|
343 none <- none[is.na(none$coral_mlg_clonal_id),];
|
|
344 na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id));
|
|
345 n.g <- sample_mlg_match_tibble$group[na.mlg2];
|
|
346 ct <- length(unique(n.g));
|
|
347
|
|
348 # List of new group ids, the sequence starts at the number of
|
|
349 # ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids
|
|
350 # plus 1.
|
|
351 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
|
|
352
|
|
353 # Assign the new id iteratively for all that have NA.
|
|
354 for (i in 1:length(na.mlg2)) {
|
|
355 sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))];
|
|
356 }
|
|
357
|
|
358 # Subset population_info_data_table for all samples.
|
|
359 # affy_id_user_specimen_id_vector looks like this:
|
|
360 # affy_id user_specimen_id
|
|
361 # a100000-432.CEL 13704
|
|
362 affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)];
|
|
363
|
|
364 # Merge data frames for final table.
|
|
365 start_time <- time_start("Merging data frames");
|
|
366 stag_db_report <- specimen_id_field_call_data_table %>%
|
|
367 left_join(affy_id_user_specimen_id_vector %>%
|
|
368 select("affy_id", "user_specimen_id"),
|
|
369 by="user_specimen_id") %>%
|
|
370 mutate(db_record = ifelse(affy_id %in% smlg_data_frame$affy_id, "genotyped", "new")) %>%
|
|
371 filter(db_record=="new") %>%
|
|
372 left_join(sample_mlg_match_tibble %>%
|
|
373 select("affy_id", "coral_mlg_clonal_id", "db_match"),
|
|
374 by="affy_id") %>%
|
|
375 left_join(missing_gt_data_table %>%
|
|
376 select("affy_id", "percent_missing_data_coral"),
|
|
377 by="affy_id") %>%
|
2
|
378 left_join(missing_gt_fixed_data_table %>%
|
|
379 select("affy_id", "percent_missing_data_fixed"),
|
|
380 by="affy_id") %>%
|
0
|
381 left_join(heterozygous_alleles_data_table %>%
|
|
382 select("affy_id", "percent_heterozygous_coral"),
|
|
383 by="affy_id") %>%
|
2
|
384 left_join(ac_alleles_data_table %>%
|
1
|
385 select("affy_id", "percent_acerv_coral"),
|
0
|
386 by="affy_id") %>%
|
2
|
387 left_join(ap_alleles_data_table %>%
|
1
|
388 select("affy_id", "percent_apalm_coral"),
|
0
|
389 by="affy_id") %>%
|
|
390 mutate(db_match = ifelse(is.na(db_match), "failed", db_match))%>%
|
|
391 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
|
2
|
392 mutate(genetic_coral_species_call = ifelse(percent_apalm_coral >= 85 & percent_acerv_coral <= 10, "A.palmata", "other")) %>%
|
|
393 mutate(genetic_coral_species_call = ifelse(percent_acerv_coral >= 85 & percent_apalm_coral <= 10, "A.cervicornis", genetic_coral_species_call)) %>%
|
0
|
394 mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>%
|
|
395 ungroup() %>%
|
2
|
396 select(-group, -db_record);
|
0
|
397 time_elapsed(start_time);
|
|
398
|
|
399 start_time <- time_start("Writing csv output");
|
|
400 write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE);
|
|
401 time_elapsed(start_time);
|
|
402
|
|
403 # Representative clone for genotype table.
|
|
404 start_time <- time_start("Creating representative clone for genotype table");
|
2
|
405 no_dup_genotypes_genind <- clonecorrect(genind_clone, strata=~pop.genind_obj.);
|
0
|
406 id_rep <- mlg.id(no_dup_genotypes_genind);
|
4
|
407 cat("\nCreating id_data_table...\n");
|
0
|
408 id_data_table <- data.table(id_rep, keep.rownames=TRUE);
|
|
409 # Rename the id_rep column.
|
|
410 setnames(id_data_table, c("id_rep"), c("affy_id"));
|
|
411 time_elapsed(start_time);
|
|
412
|
|
413 # Table of alleles for the new samples subset to new plate data.
|
|
414 # Create vector indicating number of individuals desired from
|
|
415 # affy_id column of stag_db_report data table.
|
|
416 i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]);
|
2
|
417 i <- i[!apply(i== "", 1, all), ];
|
0
|
418
|
|
419 # Subset VCF to the user samples.
|
|
420 start_time <- time_start("Subsetting vcf to the user samples");
|
6
|
421 affy_list <- append(stag_db_report$affy_id,"FORMAT");
|
|
422 svcf <- vcf[,colnames(vcf@gt) %in% affy_list];
|
0
|
423 write.vcf(svcf, "subset.vcf.gz");
|
|
424 vcf.fn <- "subset.vcf.gz";
|
|
425 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only");
|
|
426 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE);
|
|
427 gds_array <- read.gdsn(index.gdsn(genofile, "sample.id"));
|
|
428 # gds_array looks like this:
|
|
429 # [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL"
|
|
430 gds_data_frame <- data.frame(gds_array);
|
4
|
431 log_data_frame("gds_data_frame", gds_data_frame);
|
0
|
432 # gds_data_frame looks like this:
|
|
433 # gds_array
|
|
434 # a550962-4368120-060520-500_A03.CEL
|
|
435 # a550962-4368120-060520-500_A05.CEL
|
4
|
436 cat("\nCreating gds_data_table...\n");
|
0
|
437 gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[];
|
|
438 # Rename the gds_array column.
|
|
439 setnames(gds_data_table, c("gds_array"), c("affy_id"));
|
|
440 # affy_id_region_list looks like this:
|
|
441 # affy_id region
|
|
442 # a100000-4368120-060520-256_I07.CEL USVI
|
|
443 # a100000-4368120-060520-256_K07.CEL USVI
|
|
444 affy_id_region_list <- population_info_data_table[c(2,3,4)];
|
|
445 gds_data_table_join <- gds_data_table %>%
|
|
446 left_join(affy_id_region_list %>%
|
2
|
447 select("affy_id", "user_specimen_id", "region"),
|
0
|
448 by='affy_id')%>%
|
|
449 drop_na();
|
|
450 samp.annot <- data.frame(pop.group=c(gds_data_table_join$region));
|
|
451 add.gdsn(genofile, "sample.annot", samp.annot);
|
|
452 # population_code looks like this:
|
|
453 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733
|
|
454 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
|
|
455 population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"));
|
|
456 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group")));
|
|
457 # pop.group looks like this:
|
|
458 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733
|
|
459 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
|
|
460 time_elapsed(start_time);
|
|
461
|
|
462 # Distance matrix calculation and sample labels change to user specimen ids.
|
|
463 start_time <- time_start("Calculating distance matrix");
|
|
464 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE);
|
|
465 ibs$sample.id <-gds_data_table_join$user_specimen_id;
|
|
466 time_elapsed(start_time);
|
|
467
|
|
468 # Cluster analysis on the genome-wide IBS pairwise distance matrix.
|
|
469 start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix");
|
|
470 set.seed(100);
|
2
|
471 par(cex=0.6, cex.lab=1, cex.axis=1.5, cex.main=2);
|
0
|
472 ibs.hc <- snpgdsHCluster(ibs);
|
|
473 time_elapsed(start_time);
|
|
474
|
|
475 # cols looks like this:
|
|
476 # blue1 red green pink orange blue2
|
|
477 # "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF"
|
|
478 # green2 yellow turquoise poop
|
|
479 # "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF"
|
|
480 cols <- piratepal("basel");
|
|
481 set.seed(999);
|
|
482
|
|
483 # Generate plots.
|
|
484 # Default clustering.
|
|
485 start_time <- time_start("Creating ibs_default.pdf");
|
|
486 # Start PDF device driver.
|
|
487 dev.new(width=40, height=20);
|
|
488 file_path = get_file_path(output_plots_dir, "ibs_default.pdf");
|
|
489 pdf(file=file_path, width=40, height=20);
|
|
490 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15);
|
|
491 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", yaxis.kinship=FALSE);
|
2
|
492 abline(h=0.032, lty=2);
|
0
|
493 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2);
|
|
494 dev.off()
|
|
495 time_elapsed(start_time);
|
|
496
|
|
497 # Color cluster by region.
|
|
498 start_time <- time_start("Creating ibs_region.pdf");
|
|
499 # Start PDF device driver.
|
|
500 dev.new(width=40, height=20);
|
|
501 file_path = get_file_path(output_plots_dir, "ibs_region.pdf");
|
|
502 pdf(file=file_path, width=40, height=20);
|
|
503 race <- as.factor(population_code);
|
2
|
504 rv2 <- snpgdsCutTree(ibs.hc, samp.group=race, col.list=cols, pch.list=15);
|
0
|
505 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", yaxis.kinship=FALSE);
|
|
506 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2);
|
|
507 dev.off()
|
|
508 time_elapsed(start_time);
|
|
509
|
|
510 # Missing data barplot.
|
|
511 start_time <- time_start("Creating missing_data.pdf");
|
|
512 population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)];
|
|
513 test2 <- which(!is.na(population_info_data_table$miss));
|
|
514 miss96 <- population_info_data_table$miss[test2];
|
|
515 name96 <- population_info_data_table$user_specimen_id[test2];
|
|
516 # Start PDF device driver.
|
|
517 dev.new(width=20, height=10);
|
|
518 file_path = get_file_path(output_plots_dir, "missing_data.pdf");
|
|
519 pdf(file=file_path, width=20, height=10);
|
|
520 par(mar = c(8, 4, 4, 2));
|
|
521 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
|
|
522 text(cex=0.8, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
|
|
523 dev.off()
|
|
524 time_elapsed(start_time);
|
|
525
|
|
526 # Sample MLG on a map.
|
|
527 start_time <- time_start("Creating mlg_map.pdf");
|
|
528 # Get the lattitude and longitude boundaries for rendering
|
|
529 # the map. Tese boundaries will restrict the map to focus
|
|
530 # (i.e., zoom) on the region of the world map from which
|
|
531 # the samples were taken.
|
|
532 max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE);
|
|
533 min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE);
|
|
534 latitude_range_vector <- c(min_latitude-3, max_latitude+3);
|
|
535 max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE);
|
|
536 min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE);
|
|
537 longitude_range_vector <- c(min_longitude-3, max_longitude+3);
|
|
538 # Get the palette colors for rendering plots.
|
|
539 colors <- length(unique(stag_db_report$coral_mlg_clonal_id));
|
|
540 # Get a color palette.
|
|
541 palette <- colorRampPalette(piratepal("basel"));
|
|
542 # Start PDF device driver.
|
|
543 dev.new(width=20, height=20);
|
|
544 file_path = get_file_path(output_plots_dir, "mlg_map.pdf");
|
|
545 pdf(file=file_path, width=20, height=20);
|
|
546 world_data = map_data("world");
|
|
547 # Add the coral_mlg_clonal_id column from the stag_db_report
|
|
548 # data fram to the affy_metadata_data_frame.
|
|
549 affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id;
|
|
550 # Get the number of colors needed from the palette for plotting
|
|
551 # the sample locations on the world map.
|
|
552 num_colors = length(unique(affy_metadata_data_frame$mlg));
|
|
553 # Get a color palette.
|
|
554 palette = colorRampPalette(piratepal("basel"));
|
|
555 ggplot() +
|
|
556 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") +
|
|
557 coord_quickmap(xlim=longitude_range_vector, ylim=latitude_range_vector) +
|
|
558 geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, colour=mlg), alpha=.7, size=3) +
|
|
559 scale_color_manual(values=palette(num_colors)) +
|
|
560 theme(legend.position="bottom") +
|
|
561 guides(color=guide_legend(nrow=8, byrow=F));
|
|
562
|
|
563 # Sample MLG on a map for each region.
|
|
564 for (i in unique(affy_metadata_data_frame$region)) {
|
2
|
565 m <- i;
|
|
566 num_colors_2 = length(unique(affy_metadata_data_frame$mlg[which(affy_metadata_data_frame$region == m)]));
|
|
567 max_latitude_region <- max(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE);
|
|
568 min_latitude_region <- min(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE);
|
|
569 latitude_range_vector_region <- c(min_latitude_region-0.5, max_latitude_region+0.5);
|
|
570 max_longitude_region <- max(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE);
|
|
571 min_longitude_region <- min(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE);
|
|
572 longitude_range_vector_region <- c(min_longitude_region-0.5, max_longitude_region+0.5);
|
|
573 print(ggplot() +
|
|
574 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region),
|
|
575 fill="grey", colour="#7f7f7f") +
|
|
576 coord_quickmap(xlim=longitude_range_vector_region, ylim=latitude_range_vector_region, clip = "on") +
|
|
577 geom_point(data=affy_metadata_data_frame[which(affy_metadata_data_frame$region == m),],
|
|
578 aes(x=longitude, y=latitude, group=mlg, colour=mlg), alpha=.5, size=3) +
|
|
579 scale_color_manual(values=palette(num_colors_2)) +
|
|
580 theme(legend.position="bottom") + labs(title=paste("MLG assignments for", m)) +
|
|
581 guides(color=guide_legend(nrow=8, byrow=F)));
|
0
|
582 }
|
|
583 dev.off()
|
|
584 time_elapsed(start_time);
|
|
585
|
|
586 if (!is.null(opt$output_nj_phylogeny_tree)) {
|
|
587 # Create a phylogeny tree of samples based on distance matrices.
|
|
588 # Start PDF device driver.
|
|
589 start_time <- time_start("Creating nj_phylogeny_tree.pdf");
|
|
590 # Table of alleles for the new samples subset to new plate data.
|
|
591 # Create vector indicating number of individuals desired from
|
|
592 # affy_id column of stag_db_report data table.
|
|
593 i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]);
|
|
594 i <- i[!apply(i== "", 1, all),];
|
|
595 sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE];
|
|
596 dev.new(width=40, height=80);
|
|
597 file_path = get_file_path(output_plots_dir, "nj_phylogeny_tree.pdf");
|
|
598 pdf(file=file_path, width=40, height=80);
|
|
599 # Organize branches by clade.
|
|
600 nj_phylogeny_tree <- sample_alleles_vector %>%
|
|
601 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE, showtree = FALSE) %>%
|
|
602 ladderize();
|
|
603 nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)];
|
|
604 plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0025, cex=0.6, font=2, lwd=4, align.tip.label=F, no.margin=T);
|
|
605 # Add a scale bar showing 5% difference.
|
|
606 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=2);
|
|
607 nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
|
|
608 legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8);
|
|
609 dev.off()
|
|
610 time_elapsed(start_time);
|
|
611 }
|
|
612
|
|
613 # Generate a pie chart for each sample with a genotype.
|
|
614 # Store the numerical and user_specimen_id values from
|
|
615 # stag_db_report for the charts (user_specimen_id names
|
|
616 # will be used to label each chart).
|
|
617 start_time <- time_start("Creating percent_breakdown.pdf");
|
3
|
618 stag_db_report_data_table <- stag_db_report[c(-2, -3, -4, -5)];
|
0
|
619 # Remove NA and NaN values.
|
|
620 stag_db_report_data_table <- na.omit(stag_db_report_data_table);
|
|
621 # Translate to N (i.e., number of samples with a genotype)
|
|
622 # columns and 5 rows.
|
|
623 translated_stag_db_report_data_table <- t(stag_db_report_data_table);
|
|
624 translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]);
|
|
625 # Set the storage mode of the matrix to numeric. In some
|
|
626 # cases this could result in the following:
|
|
627 # Warning message:
|
|
628 # In mde(x) : NAs introduced by coercion
|
|
629 mode(translated_stag_db_report_matrix) <- "numeric";
|
|
630 # Remove NA and NaN values that may have been introduced
|
|
631 # by coercion.
|
|
632 translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix);
|
2
|
633 translated_stag_db_report_matrix<-translated_stag_db_report_matrix[-c(1),];
|
|
634 #tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE);
|
0
|
635 dev.new(width=10, height=7);
|
|
636 file_path = get_file_path(output_plots_dir, "percent_breakdown.pdf");
|
|
637 pdf(file=file_path, width=10, height=7);
|
|
638 par(mfrow=c(3, 2));
|
2
|
639 col <- c("#A6A6A6","#FFA626","#EB0ACF", "#80FF00" );
|
0
|
640 # Generate a pie chart for each sample with genotypes.
|
|
641 for (i in 1:ncol(translated_stag_db_report_matrix)) {
|
2
|
642 tmp_labels <- paste(c("no call", "heterozygous", "A. cervicornis", "A. palmata"), " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep="");
|
0
|
643 main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]);
|
|
644 pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
|
|
645 }
|
|
646 dev.off()
|
|
647 time_elapsed(start_time);
|
|
648
|
|
649 # close GDS file.
|
|
650 snpgdsClose(genofile);
|
|
651
|
|
652 # Prepare to output data frames for input to a downstream
|
|
653 # tool that will use them to update the stag database.
|
|
654 start_time <- time_start("Building data frames for insertion into database tables");
|
|
655 # sample_prep_data_frame looks like this (split across comment lines):
|
|
656 # user_specimen_id field_call bcoral_genet_id bsym_genet_id reef
|
|
657 # test_002 prolifera NA NA JohnsonsReef
|
|
658 # region latitude longitude geographic_origin colony_location
|
|
659 # Bahamas 18.36173 -64.77430 Reef NA
|
|
660 # depth disease_resist bleach_resist
|
|
661 # 5 NA N
|
|
662 # mortality tle spawning collector_last_name collector_first_name organization
|
|
663 # NA NA False Kitchen Sheila Penn State
|
|
664 # collection_date email seq_facility array_version public
|
|
665 # 2018-11-08 k89@psu.edu Affymetrix 1 True
|
|
666 # public_after_date sperm_motility healing_time dna_extraction_method
|
|
667 # NA -9 -9 NA
|
|
668 # dna_concentration registry_id result_folder_name plate_barcode mlg
|
|
669 # NA NA PRO100175_PSU175_SAX_b02 P9SR10074 HG0227
|
|
670 # affy_id percent_missing_data_coral percent_heterozygous_coral
|
|
671 # a550962-436.CEL 1.06 19.10
|
1
|
672 # percent_acerv_coral percent_apalm_coral
|
0
|
673 # 40.10459 39.73396
|
|
674 sample_prep_data_frame <- affy_metadata_data_frame %>%
|
|
675 left_join(stag_db_report %>%
|
|
676 select("user_specimen_id", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral",
|
1
|
677 "percent_acerv_coral", "percent_apalm_coral"),
|
0
|
678 by='user_specimen_id');
|
|
679 # Get the number of rows for all data frames.
|
|
680 num_rows <- nrow(sample_prep_data_frame);
|
|
681 # Set the column names so that we can extract only those columns
|
|
682 # needed for the sample table.
|
|
683 colnames(sample_prep_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
|
|
684 "region", "latitude", "longitude", "geographic_origin", "colony_location",
|
|
685 "depth", "disease_resist", "bleach_resist", "mortality", "tle",
|
|
686 "spawning", "collector_last_name", "collector_first_name", "organization",
|
|
687 "collection_date", "email", "seq_facility", "array_version", "public",
|
|
688 "public_after_date", "sperm_motility", "healing_time", "dna_extraction_method",
|
|
689 "dna_concentration", "registry_id", "result_folder_name", "plate_barcode",
|
|
690 "mlg", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral",
|
1
|
691 "percent_acerv_coral", "percent_apalm_coral");
|
0
|
692
|
|
693 # Output the data frame for updating the alleles table.
|
|
694 # Subset to only the new plate data.
|
|
695 i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]);
|
|
696 # Create a vector indicating the number of individuals desired
|
|
697 # from the affy_id collumn in the report_user data table.
|
|
698 i <- i[!apply(i=="", 1, all),];
|
|
699 # Subset the genclone object to the user data.
|
|
700 allele_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE];
|
|
701 # Convert the subset genclone to a data frame.
|
|
702 allele_data_frame <- genind2df(allele_vector, sep="");
|
|
703 allele_data_frame <- allele_data_frame %>%
|
|
704 select(-pop);
|
|
705 # Allele string for Allele.table in database.
|
|
706 allele_table_data_frame <- unite(allele_data_frame, alleles, 1:19696, sep=" ", remove=TRUE);
|
|
707 allele_table_data_frame <- setDT(allele_table_data_frame, keep.rownames=TRUE)[];
|
|
708 setnames(allele_table_data_frame, c("rn"), c("affy_id"));
|
|
709 # write.csv(concat_sample_alleles,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE);
|
|
710 write_data_frame(output_data_dir, "allele.tabular", allele_table_data_frame);
|
|
711
|
|
712 # Output the data frame for updating the experiment table.
|
|
713 experiment_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows));
|
|
714 colnames(experiment_table_data_frame) <- c("seq_facility", "array_version", "result_folder_name", "plate_barcode");
|
|
715 for (i in 1:num_rows) {
|
|
716 experiment_table_data_frame$seq_facility[i] <- sample_prep_data_frame$seq_facility[i];
|
|
717 experiment_table_data_frame$array_version[i] <- sample_prep_data_frame$array_version[i];
|
|
718 experiment_table_data_frame$result_folder_name[i] <- sample_prep_data_frame$result_folder_name[i];
|
|
719 experiment_table_data_frame$plate_barcode[i] <- sample_prep_data_frame$plate_barcode[i];
|
|
720 }
|
|
721 write_data_frame(output_data_dir, "experiment.tabular", experiment_table_data_frame);
|
|
722
|
|
723 # Output the data frame for updating the colony table.
|
|
724 # The geographic_origin value is used for deciding into which table
|
|
725 # to insert the latitude and longitude values. If the geographic_origin
|
|
726 # is "reef", the values will be inserted into the reef table, and if it is
|
|
727 # "colony", the values will be inserted into the colony table. We insert
|
|
728 # these values in both data frames so that the downstream tool that parses
|
|
729 # them can determine the appropriate table.
|
|
730 colony_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows));
|
|
731 colnames(colony_table_data_frame) <- c("latitude", "longitude", "depth", "geographic_origin");
|
|
732 for (i in 1:num_rows) {
|
|
733 colony_table_data_frame$latitude[i] <- sample_prep_data_frame$latitude[i];
|
|
734 colony_table_data_frame$longitude[i] <- sample_prep_data_frame$longitude[i];
|
|
735 colony_table_data_frame$depth[i] <- sample_prep_data_frame$depth[i];
|
|
736 colony_table_data_frame$geographic_origin[i] <- sample_prep_data_frame$geographic_origin[i];
|
|
737 }
|
|
738 write_data_frame(output_data_dir, "colony.tabular", colony_table_data_frame);
|
|
739
|
|
740 # Output the data frame for populating the genotype table.
|
|
741 # Combine with previously genotyped samples.
|
|
742 # prep_genotype_tibble looks like this:
|
|
743 # A tibble: 220 x 7
|
|
744 # Groups: group [?]
|
|
745 # group affy_id coral_mlg_clona… user_specimen_id db_match
|
|
746 # <int> <chr> <chr> <chr> <chr>
|
|
747 # 1 a10000… 13905 HG0048 match
|
|
748 # genetic_coral_species_call coral_mlg_rep_sample_id
|
|
749 # <chr> <chr>
|
|
750 # A.palmata 1104
|
|
751 prep_genotype_tibble <- id_data_table %>%
|
|
752 group_by(row_number()) %>%
|
|
753 dplyr::rename(group='row_number()') %>%
|
|
754 unnest(affy_id) %>%
|
|
755 left_join(smlg_data_frame %>%
|
|
756 select("affy_id", "coral_mlg_rep_sample_id", "coral_mlg_clonal_id", "user_specimen_id",
|
|
757 "genetic_coral_species_call", "bcoral_genet_id"),
|
|
758 by='affy_id');
|
|
759 # Confirm that the representative mlg is the same between runs.
|
|
760 uniques2 <- unique(prep_genotype_tibble[c("group", "coral_mlg_rep_sample_id")]);
|
|
761 uniques2 <- uniques2[!is.na(uniques2$coral_mlg_rep_sample_id),];
|
|
762 na.mlg3 <- which(is.na(prep_genotype_tibble$coral_mlg_rep_sample_id));
|
|
763 na.group2 <- prep_genotype_tibble$group[na.mlg3];
|
|
764 prep_genotype_tibble$coral_mlg_rep_sample_id[na.mlg3] <- uniques2$coral_mlg_rep_sample_id[match(na.group2, uniques2$group)];
|
|
765 # Transform the representative mlg column with new genotyped samples.
|
|
766 # representative_mlg_tibble looks like this:
|
|
767 # A tibble: 220 x 5
|
|
768 # affy_id coral_mlg_rep_sa… coral_mlg_clona… user_specimen_id
|
|
769 # <chr> <chr> <chr> <chr>
|
|
770 # a100000-… 13905 HG0048 13905
|
|
771 # genetic_coral_species_call bcoral_genet_id
|
|
772 # <chr> <chr>
|
|
773 # A.palmata C1651
|
|
774 representative_mlg_tibble <- prep_genotype_tibble %>%
|
|
775 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_rep_sample_id), affy_id, coral_mlg_rep_sample_id)) %>%
|
|
776 ungroup() %>%
|
|
777 select(-group);
|
|
778 # prep_genotype_table_tibble looks like this:
|
|
779 # affy_id coral_mlg_clonal_id user_specimen_id db_match
|
|
780 # a550962...CEL HG0120 1090 match
|
|
781 # genetic_coral_species_call coral_mlg_rep_sample_id
|
|
782 # A.palmata 1104
|
|
783 prep_genotype_table_tibble <- stag_db_report %>%
|
|
784 select("affy_id", "coral_mlg_clonal_id", "user_specimen_id", "db_match", "genetic_coral_species_call") %>%
|
|
785 left_join(representative_mlg_tibble %>%
|
|
786 select("affy_id", "coral_mlg_rep_sample_id"),
|
|
787 by='affy_id');
|
|
788 # genotype_table_tibble looks like this:
|
|
789 # affy_id coral_mlg_clonal_id user_specimen_id db_match
|
|
790 # a550962-436.CEL HG0120 1090 match
|
|
791 # genetic_coral_species_call coral_mlg_rep_sample_id bcoral_genet_id
|
|
792 # A.palmata 1104 <NA>
|
|
793 genotype_table_tibble <- prep_genotype_table_tibble %>%
|
|
794 left_join(affy_metadata_data_frame %>%
|
|
795 select("user_specimen_id", "bcoral_genet_id"),
|
|
796 by='user_specimen_id');
|
|
797 write_data_frame(output_data_dir, "genotype.tabular", genotype_table_tibble);
|
|
798
|
|
799 # Output the file needed for populating the person table.
|
|
800 person_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows));
|
|
801 colnames(person_table_data_frame) <- c("last_name", "first_name", "organization", "email");
|
|
802 # person_table_data_frame looks like this:
|
|
803 # last_name first_name organization email
|
|
804 # Kitchen Sheila Penn State s89@psu.edu
|
|
805 for (i in 1:num_rows) {
|
|
806 person_table_data_frame$last_name[i] <- sample_prep_data_frame$collector_last_name[i];
|
|
807 person_table_data_frame$first_name[i] <- sample_prep_data_frame$collector_first_name[i];
|
|
808 person_table_data_frame$organization[i] <- sample_prep_data_frame$organization[i];
|
|
809 person_table_data_frame$email[i] <- sample_prep_data_frame$email[i];
|
|
810 }
|
|
811 write_data_frame(output_data_dir, "person.tabular", person_table_data_frame);
|
|
812
|
|
813 # Output the file needed for populating the phenotype table.
|
|
814 phenotype_table_data_frame <- data.frame(matrix(ncol=7, nrow=num_rows));
|
|
815 colnames(phenotype_table_data_frame) <- c("disease_resist", "bleach_resist", "mortality", "tle",
|
|
816 "spawning", "sperm_motility", "healing_time");
|
|
817 # phenotype_table_data_frame looks like this:
|
|
818 # disease_resist bleach_resist mortality tle spawning sperm_motility healing_time
|
|
819 # NA NA NA NA False NA NA
|
|
820 for (i in 1:num_rows) {
|
|
821 phenotype_table_data_frame$disease_resist[i] <- sample_prep_data_frame$disease_resist[i];
|
|
822 phenotype_table_data_frame$bleach_resist[i] <- sample_prep_data_frame$bleach_resist[i];
|
|
823 phenotype_table_data_frame$mortality[i] <- sample_prep_data_frame$mortality[i];
|
|
824 phenotype_table_data_frame$tle[i] <- sample_prep_data_frame$tle[i];
|
|
825 phenotype_table_data_frame$spawning[i] <- sample_prep_data_frame$spawning[i];
|
|
826 phenotype_table_data_frame$sperm_motility[i] <- sample_prep_data_frame$sperm_motility[i];
|
|
827 phenotype_table_data_frame$healing_time[i] <- sample_prep_data_frame$healing_time[i];
|
|
828 }
|
|
829 write_data_frame(output_data_dir, "phenotype.tabular", phenotype_table_data_frame);
|
|
830
|
|
831 # Output the file needed for populating the reef table.
|
|
832 reef_table_data_frame <- data.frame(matrix(ncol=5, nrow=num_rows));
|
|
833 colnames(reef_table_data_frame) <- c("name", "region", "latitude", "longitude", "geographic_origin");
|
|
834 # The geographic_origin value is used for deciding into which table
|
|
835 # to insert the latitude and longitude values. If the geographic_origin
|
|
836 # is "reef", the values will be inserted into the reef table, and if it is
|
|
837 # "colony", the values will be inserted into the colony table. We insert
|
|
838 # these values in both data frames so that the downstream tool that parses
|
|
839 # them can determine the appropriate table.
|
|
840 # reef_table_data_frame looks like this:
|
|
841 # name region latitude longitude geographic_origin
|
|
842 # JohnsonsReef Bahamas 18.361733 -64.7743 Reef
|
|
843 for (i in 1:num_rows) {
|
|
844 reef_table_data_frame$name[i] <- sample_prep_data_frame$reef[i];
|
|
845 reef_table_data_frame$region[i] <- sample_prep_data_frame$region[i];
|
|
846 reef_table_data_frame$latitude[i] <- sample_prep_data_frame$latitude[i];
|
|
847 reef_table_data_frame$longitude[i] <- sample_prep_data_frame$longitude[i];
|
|
848 reef_table_data_frame$geographic_origin[i] <- sample_prep_data_frame$geographic_origin[i];
|
|
849 }
|
|
850 write_data_frame(output_data_dir, "reef.tabular", reef_table_data_frame);
|
|
851
|
|
852 # Output the file needed for populating the sample table.
|
|
853 sample_table_data_frame <- data.frame(matrix(ncol=20, nrow=num_rows));
|
|
854 colnames(sample_table_data_frame) <- c("affy_id", "colony_location", "collection_date", "user_specimen_id",
|
|
855 "registry_id", "depth", "dna_extraction_method", "dna_concentration",
|
|
856 "public", "public_after_date", "percent_missing_data_coral",
|
1
|
857 "percent_missing_data_sym", "percent_acerv_coral",
|
|
858 "percent_reference_sym", "percent_apalm_coral",
|
0
|
859 "percent_alternative_sym", "percent_heterozygous_coral",
|
|
860 "percent_heterozygous_sym", "field_call", "bcoral_genet_id");
|
|
861 for (i in 1:num_rows) {
|
|
862 sample_table_data_frame$affy_id[i] <- sample_prep_data_frame$affy_id[i];
|
|
863 sample_table_data_frame$colony_location[i] <- sample_prep_data_frame$colony_location[i];
|
|
864 sample_table_data_frame$collection_date[i] <- sample_prep_data_frame$collection_date[i];
|
|
865 sample_table_data_frame$user_specimen_id[i] <- sample_prep_data_frame$user_specimen_id[i];
|
|
866 sample_table_data_frame$registry_id[i] <- sample_prep_data_frame$registry_id[i];
|
|
867 sample_table_data_frame$depth[i] <- sample_prep_data_frame$depth[i];
|
|
868 sample_table_data_frame$dna_extraction_method[i] <- sample_prep_data_frame$dna_extraction_method[i];
|
|
869 sample_table_data_frame$dna_concentration[i] <- sample_prep_data_frame$dna_concentration[i];
|
|
870 sample_table_data_frame$public[i] <- sample_prep_data_frame$public[i];
|
|
871 sample_table_data_frame$public_after_date[i] <- sample_prep_data_frame$public_after_date[i];
|
|
872 sample_table_data_frame$percent_missing_data_coral[i] <- sample_prep_data_frame$percent_missing_data_coral[i];
|
|
873 sample_table_data_frame$percent_missing_data_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
|
1
|
874 sample_table_data_frame$percent_acerv_coral[i] <- sample_prep_data_frame$percent_acerv_coral[i];
|
0
|
875 sample_table_data_frame$percent_reference_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
|
1
|
876 sample_table_data_frame$percent_apalm_coral[i] <- sample_prep_data_frame$percent_apalm_coral[i];
|
0
|
877 sample_table_data_frame$percent_alternative_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
|
|
878 sample_table_data_frame$percent_heterozygous_coral[i] <- sample_prep_data_frame$percent_heterozygous_coral[i];
|
|
879 sample_table_data_frame$percent_heterozygous_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
|
|
880 sample_table_data_frame$field_call[i] <- sample_prep_data_frame$field_call[i];
|
|
881 sample_table_data_frame$bcoral_genet_id[i] <- sample_prep_data_frame$bcoral_genet_id[i];
|
|
882 }
|
|
883 write_data_frame(output_data_dir, "sample.tabular", sample_table_data_frame);
|
|
884
|
|
885 # Output the file needed for populating the taxonomy table.
|
|
886 # taxonomy_table_data_frame looks like this:
|
|
887 # genetic_coral_species_call affy_id genus_name species_name
|
|
888 # A.palmata a550962-4368120-060520-500_A05.CEL Acropora palmata
|
|
889 taxonomy_table_data_frame <- stag_db_report %>%
|
|
890 select(genetic_coral_species_call, affy_id) %>%
|
2
|
891 mutate(genus_name = ifelse(grepl("^A.*", genetic_coral_species_call), "Acropora",ifelse(!is.na(genetic_coral_species_call), "other", NA))) %>%
|
0
|
892 mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>%
|
|
893 mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>%
|
|
894 mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name));
|
|
895 colnames(taxonomy_table_data_frame) <- c("genetic_coral_species_call", "affy_id", "genus_name", "species_name");
|
|
896 write_data_frame(output_data_dir, "taxonomy.tabular", taxonomy_table_data_frame);
|
|
897 time_elapsed(start_time);
|
4
|
898
|