Mercurial > repos > petr-novak > re_utils
comparison rmsk_summary_table_multiple.r @ 0:a4cd8608ef6b draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 01 Apr 2019 07:56:36 -0400 |
parents | |
children | 62fefa284036 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a4cd8608ef6b |
---|---|
1 #! /usr/bin/env Rscript | |
2 # repeat masker summary | |
3 # analysis of *.out file | |
4 # input arguments: <rmsk_result.out> <reads.fas. | |
5 # calculates totoal proportion of matched repetetive families and averagee score and print the table | |
6 suppressPackageStartupMessages(library(getopt)) | |
7 | |
8 # INPUT ARGUMENTS: | |
9 opt = getopt(matrix(c( | |
10 'fasta', 'f', 1, "character", | |
11 'repeat_masker','r', 1, "character", | |
12 'output', 'o', 1, "character", | |
13 'help','h',0,'logical'),ncol=4,byrow=T)) | |
14 | |
15 if (!is.null(opt$help)) { | |
16 cat("Usage:\n\n -f fasta file \n -r repeat masker output\n -o output files\n") | |
17 stop() | |
18 } | |
19 | |
20 RMfiles=system(paste("ls ",opt$repeat_masker,sep=""),intern=T) | |
21 fasta=system(paste("ls ",opt$fasta,sep=""),intern=T) | |
22 | |
23 | |
24 | |
25 getsubdir=function(x){ | |
26 xx=gsub(".*/","",x) | |
27 sdir=gsub(xx,"",x,fixed=T) | |
28 sdir | |
29 } | |
30 | |
31 fasta.summary=function(reads){ | |
32 suppressPackageStartupMessages(library(Biostrings,quiet=T)) | |
33 if (exists("readDNAStringSet")){ | |
34 seqs=readDNAStringSet(reads) | |
35 }else{ | |
36 seqs=read.DNAStringSet(reads) | |
37 } | |
38 | |
39 N=length(seqs) | |
40 Ls=nchar(seqs) | |
41 L=sum(Ls) | |
42 M=median(Ls) | |
43 A=mean(Ls) | |
44 output=c(N,L,M,A) | |
45 names(output)=c("N","length","median","mean") | |
46 output | |
47 } | |
48 | |
49 repCont=function(repname){ # uses external variable! | |
50 pos=repname==classfamily | |
51 RC=unname(sum(lengths[pos])/totalLength) | |
52 RC | |
53 } | |
54 repScore=function(repname){ # uses external variable! | |
55 pos=repname==classfamily | |
56 S=unname(mean(score[pos])) | |
57 S | |
58 } | |
59 | |
60 N=length(RMfiles) | |
61 | |
62 for (i in seq_along(RMfiles)){ | |
63 cat(paste((RMfiles[i]),"\n")) | |
64 | |
65 sdir=getsubdir(RMfiles[i]) | |
66 seqs.summary=fasta.summary(fasta[i]) | |
67 totalLength=seqs.summary["length"] | |
68 | |
69 total.counts=data.frame(c("All_Reads_Length","All_Reads_Number"),c(NA,NA),c(NA,NA),c(totalLength,seqs.summary['N']),c(totalLength,seqs.summary['N']),c(NA,NA)) | |
70 colnames(total.counts)=c("class/fammily","class","family","hits","content","Mean_Score") | |
71 | |
72 | |
73 # get RM data if not empty | |
74 rmsk_empty=length(grep("There were no repetitive sequences detected",scan(RMfiles[i],what=character(),n=1,sep="\n",quiet=T),fixed=T))>0 | |
75 rmsk_notperformed = length(grep("No Repbase used with RepetMasker",scan(RMfiles[i],what=character(),n=1,sep="\n",quiet=T),fixed=T))>0 | |
76 if (!rmsk_empty & !rmsk_notperformed) { | |
77 rmsk=read.table(RMfiles[i],header=F,as.is=T,skip=2,comment.char="*",fill=T) | |
78 score=rmsk[,1] | |
79 Ids=rmsk[,5] | |
80 starts=rmsk[,6] | |
81 ends=rmsk[,7] | |
82 lengths=ends-starts | |
83 classfamily=rmsk[,11] | |
84 out.table=table(classfamily) | |
85 out.table=data.frame(names(out.table),c(out.table),stringsAsFactors=F) | |
86 contents=sapply(out.table[,1],repCont)*100 | |
87 meanScore=sapply(out.table[,1],repScore) | |
88 class=gsub("/.*","",out.table[,1]) | |
89 families=gsub(".*/","",out.table[,1]) | |
90 out.table=data.frame(out.table[,1],class,families,out.table[,-1],contents,meanScore,stringsAsFactors=F) | |
91 colnames(out.table)=c("class/fammily","class","family","hits","content","Mean_Score") | |
92 out.table=out.table[order(out.table$content,decreasing=T),] | |
93 out.table=rbind(out.table,total.counts) | |
94 }else{ | |
95 out.table=total.counts | |
96 } | |
97 out.table=out.table[order(out.table$content,decreasing=T),] | |
98 | |
99 write.table(out.table,file=paste(sdir,opt$output,sep=""),row.names=F,quote=F,sep="\t") | |
100 | |
101 | |
102 # merge all tables: | |
103 | |
104 colnames(out.table)=paste(colnames(out.table),c("",rep(sdir,5))) | |
105 | |
106 if (i==1) { | |
107 mergedTable=out.table[,c(1,4,5,6)] | |
108 }else{ | |
109 | |
110 mergedTable=merge(mergedTable,out.table[,c(1,4,5,6)],by=1,all=T) | |
111 } | |
112 | |
113 } | |
114 mergedTable=mergedTable[order(rowSums(mergedTable[,seq(2,N*3,by=3),drop=FALSE],na.rm=T),decreasing=T),] | |
115 mergedTable[is.na(mergedTable)]<-0 | |
116 | |
117 write.table(t(mergedTable),file=paste(opt$output,"summary.csv",sep=""),col.names=F,row.names=T,quote=F,sep="\t") | |
118 | |
119 #save.image("tmp.RData") | |
120 | |
121 | |
122 | |
123 | |
124 | |
125 | |
126 | |
127 |