Mercurial > repos > petr-novak > repeat_annotation_pipeline3
comparison annotate_contigs.R @ 0:ea6a3059a6af draft
Uploaded
| author | petr-novak | 
|---|---|
| date | Mon, 18 Oct 2021 11:01:20 +0000 | 
| parents | |
| children | 814cba36e435 | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:ea6a3059a6af | 
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 ## input 1 - fasta with CLXContig names | |
| 3 ## input 2 - annotation | |
| 4 ## output 3 - annotated fasta | |
| 5 suppressPackageStartupMessages(library(Biostrings)) | |
| 6 | |
| 7 clean_contigs = function(s){ | |
| 8 ## remove all N | |
| 9 sc = as.character(s) | |
| 10 sc_trimmed = gsub("N+$", "", gsub("^N+","",s)) | |
| 11 ## remove blank and short sequences: | |
| 12 sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) !=0] | |
| 13 sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <=20] | |
| 14 sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) >20] | |
| 15 sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed=TRUE)] | |
| 16 sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean)) | |
| 17 } | |
| 18 | |
| 19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta | |
| 20 ## input 1 - input.fasta - contigs from clustering | |
| 21 ## input 2 - annot.csv of clusters, firts column is CL number, seciond is annotation | |
| 22 ## | |
| 23 ## output - clean conntigs with appended annotation | |
| 24 | |
| 25 ## find header row of annotation table | |
| 26 x = readLines(commandArgs(T)[2]) | |
| 27 | |
| 28 ## TODO - check mandatory names!!! | |
| 29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x))) | |
| 30 message("using line ", hl, " as header") | |
| 31 | |
| 32 annotation_table=read.table(commandArgs(T)[2], sep="\t", header=TRUE, skip = hl - 1) | |
| 33 colnames(annotation_table) = tolower(colnames(annotation_table)) | |
| 34 | |
| 35 contigs = readDNAStringSet(commandArgs(T)[1]) | |
| 36 | |
| 37 | |
| 38 if("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))){ | |
| 39 annot_dict = annotation_table$final_annotation | |
| 40 message("using final annotation column") | |
| 41 }else{ | |
| 42 message("using automatic annotation column") | |
| 43 annot_dict = annotation_table$automatic_annotation | |
| 44 } | |
| 45 | |
| 46 | |
| 47 | |
| 48 names(annot_dict) = paste0("CL",annotation_table$cluster) | |
| 49 #print(annot_dict) | |
| 50 | |
| 51 contigs_ok = clean_contigs(contigs) | |
| 52 contig_name = gsub("Contig.+","",names(contigs_ok)) | |
| 53 | |
| 54 ## keep only contigs which are in annot table | |
| 55 include = contig_name %in% names(annot_dict) | |
| 56 | |
| 57 contig_name_inc = contig_name[include] | |
| 58 contig_ok_include = contigs_ok[include] | |
| 59 | |
| 60 new_name_with_annot = paste0(names(contig_ok_include),"#",annot_dict[contig_name_inc]) | |
| 61 names(contig_ok_include) = new_name_with_annot | |
| 62 | |
| 63 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3]) | 
