| 0 | 1 #!/usr/bin/env Rscript | 
|  | 2 ## input 1 - fasta with CLXContig names | 
|  | 3 ## input 2 - annotation | 
|  | 4 ## output 3 - annotated fasta | 
|  | 5 suppressPackageStartupMessages(library(Biostrings)) | 
|  | 6 | 
| 1 | 7 clean_contigs = function(s) { | 
| 0 | 8   ## remove all N | 
|  | 9   sc = as.character(s) | 
| 1 | 10   sc_trimmed = gsub("N+$", "", gsub("^N+", "", s)) | 
| 0 | 11   ## remove blank and short sequences: | 
| 1 | 12   sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) != 0] | 
|  | 13   sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <= 20] | 
|  | 14   sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) > 20] | 
|  | 15   sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed = TRUE)] | 
| 0 | 16   sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean)) | 
|  | 17 } | 
|  | 18 | 
|  | 19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta | 
|  | 20 ## input 1 - input.fasta - contigs from clustering | 
|  | 21 ## input 2 - annot.csv of clusters, firts column is CL number, seciond is annotation | 
|  | 22 ## | 
|  | 23 ## output - clean conntigs with appended annotation | 
|  | 24 | 
|  | 25 ## find header row of annotation table | 
|  | 26 x = readLines(commandArgs(T)[2]) | 
|  | 27 | 
|  | 28 ## TODO - check mandatory names!!! | 
|  | 29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x))) | 
|  | 30 message("using line ", hl, " as header") | 
|  | 31 | 
| 1 | 32 annotation_table = read.table(commandArgs(T)[2], sep = "\t", header = TRUE, skip = hl - 1) | 
| 0 | 33 colnames(annotation_table) = tolower(colnames(annotation_table)) | 
|  | 34 | 
|  | 35 contigs = readDNAStringSet(commandArgs(T)[1]) | 
|  | 36 | 
|  | 37 | 
| 1 | 38 if ("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))) { | 
| 0 | 39   annot_dict = annotation_table$final_annotation | 
|  | 40   message("using final annotation column") | 
| 1 | 41 }else { | 
| 0 | 42   message("using automatic annotation column") | 
|  | 43   annot_dict = annotation_table$automatic_annotation | 
|  | 44 } | 
|  | 45 | 
|  | 46 | 
| 1 | 47 names(annot_dict) = paste0("CL", annotation_table$cluster) | 
| 0 | 48 #print(annot_dict) | 
|  | 49 | 
|  | 50 contigs_ok = clean_contigs(contigs) | 
| 1 | 51 contig_name = gsub("Contig.+", "", names(contigs_ok)) | 
| 0 | 52 | 
|  | 53 ## keep only contigs which are in annot table | 
|  | 54 include = contig_name %in% names(annot_dict) | 
|  | 55 | 
|  | 56 contig_name_inc = contig_name[include] | 
|  | 57 contig_ok_include = contigs_ok[include] | 
|  | 58 | 
| 1 | 59 new_name_with_annot = paste0(names(contig_ok_include), "#", annot_dict[contig_name_inc]) | 
| 0 | 60 names(contig_ok_include) = new_name_with_annot | 
|  | 61 | 
|  | 62 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3]) |