Mercurial > repos > chrisd > testing
comparison gene_fraction/src/SamRatio.h @ 0:f95150c37d38 draft default tip
planemo upload for repository https://github.com/ChrisD11/Tools commit ddc95e5d6b5f2c0a5340c0bc384aa822db8856d5
author | chrisd |
---|---|
date | Sun, 21 Feb 2016 23:31:55 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f95150c37d38 |
---|---|
1 #ifndef SAM_RATIO_H | |
2 #define SAM_RATIO_H | |
3 | |
4 #include <iostream> | |
5 #include <fstream> | |
6 #include <algorithm> | |
7 | |
8 #include "FastaRecord.h" | |
9 #include "Alignments.h" | |
10 #include "args.h" | |
11 | |
12 typedef std::vector<std::pair<int,char>> cigar_str; | |
13 | |
14 /** | |
15 * | |
16 */ | |
17 struct header { | |
18 std::string level = "Level"; | |
19 std::string iteration = "Iteration"; | |
20 std::string gene_id = "Gene id"; | |
21 std::string gene_fraction = "Gene Fraction"; | |
22 std::string hits = "Hits"; | |
23 }; | |
24 | |
25 /** | |
26 * Reports the total number of bases that were touched for each | |
27 * gene. This is largely calculated using the positional and seq | |
28 * information found in fields four and ten of each alignment | |
29 */ | |
30 void analyze_coverage(FastaRecord &record, Alignments &alignment) { | |
31 record.update_gene_hits(); | |
32 | |
33 cigar_str cigar = alignment.cigar(); | |
34 | |
35 int len; | |
36 char op; | |
37 | |
38 int occurrence; | |
39 int pos_in_gene = alignment.pos(); | |
40 | |
41 int start, stop; | |
42 int base_hits = record._base_hits.size(); // func this | |
43 int read_length = alignment.seq().length(); //func this | |
44 | |
45 if(pos_in_gene == 1) { | |
46 occurrence = 0; | |
47 for(int i = 0; i < cigar.size(); i++) { | |
48 len = cigar[i].first; | |
49 op = cigar[i].second; | |
50 | |
51 switch(op) { | |
52 case 'M': | |
53 occurrence += len; | |
54 break; | |
55 case 'I': | |
56 occurrence += len; | |
57 break; | |
58 default: | |
59 break; | |
60 } | |
61 } | |
62 | |
63 start = read_length - occurrence; | |
64 stop = start + read_length; | |
65 | |
66 for(int i = start; i < base_hits; i++) { | |
67 if(i == stop) break; | |
68 record._base_hits[i] = 1; | |
69 } | |
70 } | |
71 else { | |
72 start = pos_in_gene - 1; | |
73 stop = start + read_length; | |
74 | |
75 for(int i = start; i < base_hits; i++) { | |
76 if(i == stop) break; | |
77 record._base_hits[i] = 1; | |
78 } | |
79 } | |
80 } | |
81 | |
82 /** | |
83 * Returns gene fraction of fasta record | |
84 * Returns -1 if gene fraction is not greater than threshold | |
85 */ | |
86 double coverage(const FastaRecord &record, const int &threshold) { | |
87 double gene_coverage; | |
88 | |
89 int base_hits, gene_size; | |
90 | |
91 base_hits = record.get_base_hits(); | |
92 gene_size = record.gene().length(); | |
93 | |
94 gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100; | |
95 | |
96 if(gene_coverage > threshold) | |
97 return gene_coverage; | |
98 return -1; | |
99 } | |
100 | |
101 /** | |
102 * Writes header to output file when | |
103 * reading from stdin | |
104 */ | |
105 void bam_stream_header() { | |
106 header h; | |
107 char sep = ','; | |
108 | |
109 std::cout << h.level << sep << h.iteration << sep | |
110 << h.gene_id << sep << h.gene_fraction << sep | |
111 << h.hits << '\n'; | |
112 } | |
113 | |
114 /** | |
115 * Writes header to output file when | |
116 * reading from sam file | |
117 */ | |
118 void file_header(const std::string &out_fp, const std::string &sam_fp) { | |
119 header h; | |
120 std::ofstream out(out_fp.c_str(), std::ofstream::app ); | |
121 char sep = ','; | |
122 | |
123 //out << "@" << sam_fp << '\n'; | |
124 out << h.level << sep << h.iteration << sep | |
125 << h.gene_id << sep << h.gene_fraction << sep | |
126 << h.hits << '\n'; | |
127 out.close(); | |
128 } | |
129 | |
130 /** | |
131 * | |
132 */ | |
133 void create_header(cmd_args &args) { | |
134 if(args.bam_stream) { | |
135 bam_stream_header(); | |
136 } | |
137 else { | |
138 file_header(args.out_fp, args.sam_fp); | |
139 } | |
140 } | |
141 | |
142 /** | |
143 * Writes results to output file when reading from | |
144 * stdin | |
145 */ | |
146 void bam_stream_results(std::vector<FastaRecord> &records, | |
147 const int &level, const int &iteration, | |
148 cmd_args &args) { | |
149 | |
150 double gene_fraction; | |
151 int hits_seen; | |
152 std::string id; | |
153 char sep = ','; | |
154 | |
155 for(auto &rec : records) { | |
156 gene_fraction = coverage(rec, args.threshold); | |
157 hits_seen = rec.gene_hits(); | |
158 id = rec.gene_id(); | |
159 | |
160 if(gene_fraction > 0) { | |
161 std::cout << level << sep << iteration << sep | |
162 << id << sep << gene_fraction << sep | |
163 << hits_seen << '\n'; | |
164 } | |
165 } | |
166 } | |
167 | |
168 /** | |
169 * Write results when reading sam file from | |
170 * path | |
171 */ | |
172 void file_results(std::vector<FastaRecord> &records, | |
173 const int level, const int &iteration, | |
174 cmd_args &args) { | |
175 | |
176 std::ofstream out(args.out_fp.c_str(), std::ofstream::app); | |
177 | |
178 double gene_fraction; | |
179 int hits_seen; | |
180 std::string id; | |
181 char sep = ','; | |
182 | |
183 for(auto &rec : records) { | |
184 gene_fraction = coverage(rec, args.threshold); | |
185 hits_seen = rec.gene_hits(); | |
186 id = rec.gene_id(); | |
187 | |
188 if(gene_fraction > 0) { | |
189 out << level << sep << iteration << sep | |
190 << id << sep << gene_fraction << sep | |
191 << hits_seen << '\n'; | |
192 } | |
193 } | |
194 out.close(); | |
195 } | |
196 | |
197 /** | |
198 * | |
199 */ | |
200 void report_results(std::vector<FastaRecord> &records, | |
201 const int level, const int &iteration, | |
202 cmd_args &args) { | |
203 | |
204 if(args.bam_stream) { | |
205 bam_stream_results(records,level,iteration,args); | |
206 } | |
207 else { | |
208 file_results(records,level,iteration,args); | |
209 } | |
210 } | |
211 | |
212 /** | |
213 * Generates a sequence of samples from sam file specified | |
214 * by the starting level, max level, and skip pattern | |
215 */ | |
216 void generate_samples(std::vector<FastaRecord> &records, | |
217 std::vector<Alignments> &alignments, | |
218 cmd_args &args) { | |
219 | |
220 int read_count = alignments.size(); | |
221 int sample_size; | |
222 | |
223 srand(unsigned(time(0))); | |
224 | |
225 std::vector<int> sequence(read_count); | |
226 iota(sequence.begin(), sequence.end(), 0); | |
227 | |
228 create_header(args); | |
229 | |
230 for(int level = args.min; level <= args.max; level += args.skip) { | |
231 for(int sample = 0; sample < args.s_per_lev; sample++) { | |
232 random_shuffle(sequence.begin(), sequence.end(), randomize); | |
233 sample_size = round(((static_cast<double>(level)/100)*read_count)); | |
234 std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size); | |
235 | |
236 for(int a_idx = 0; a_idx < chosen.size(); a_idx++) { | |
237 std::string rname = alignments[chosen[a_idx]].rname(); | |
238 int gene_idx = FastaRecord::find_gene(records, rname); | |
239 analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]); | |
240 } | |
241 report_results(records,level,sample+1,args); | |
242 FastaRecord::reset_base_hits(records); | |
243 FastaRecord::reset_gene_hits(records); | |
244 } | |
245 } | |
246 } | |
247 | |
248 #endif /* SAM_RATIO_H */ |