Mercurial > repos > chrisd > testing
comparison gene_fraction/src/SamRatio.h @ 0:f95150c37d38 draft default tip
planemo upload for repository https://github.com/ChrisD11/Tools commit ddc95e5d6b5f2c0a5340c0bc384aa822db8856d5
| author | chrisd |
|---|---|
| date | Sun, 21 Feb 2016 23:31:55 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f95150c37d38 |
|---|---|
| 1 #ifndef SAM_RATIO_H | |
| 2 #define SAM_RATIO_H | |
| 3 | |
| 4 #include <iostream> | |
| 5 #include <fstream> | |
| 6 #include <algorithm> | |
| 7 | |
| 8 #include "FastaRecord.h" | |
| 9 #include "Alignments.h" | |
| 10 #include "args.h" | |
| 11 | |
| 12 typedef std::vector<std::pair<int,char>> cigar_str; | |
| 13 | |
| 14 /** | |
| 15 * | |
| 16 */ | |
| 17 struct header { | |
| 18 std::string level = "Level"; | |
| 19 std::string iteration = "Iteration"; | |
| 20 std::string gene_id = "Gene id"; | |
| 21 std::string gene_fraction = "Gene Fraction"; | |
| 22 std::string hits = "Hits"; | |
| 23 }; | |
| 24 | |
| 25 /** | |
| 26 * Reports the total number of bases that were touched for each | |
| 27 * gene. This is largely calculated using the positional and seq | |
| 28 * information found in fields four and ten of each alignment | |
| 29 */ | |
| 30 void analyze_coverage(FastaRecord &record, Alignments &alignment) { | |
| 31 record.update_gene_hits(); | |
| 32 | |
| 33 cigar_str cigar = alignment.cigar(); | |
| 34 | |
| 35 int len; | |
| 36 char op; | |
| 37 | |
| 38 int occurrence; | |
| 39 int pos_in_gene = alignment.pos(); | |
| 40 | |
| 41 int start, stop; | |
| 42 int base_hits = record._base_hits.size(); // func this | |
| 43 int read_length = alignment.seq().length(); //func this | |
| 44 | |
| 45 if(pos_in_gene == 1) { | |
| 46 occurrence = 0; | |
| 47 for(int i = 0; i < cigar.size(); i++) { | |
| 48 len = cigar[i].first; | |
| 49 op = cigar[i].second; | |
| 50 | |
| 51 switch(op) { | |
| 52 case 'M': | |
| 53 occurrence += len; | |
| 54 break; | |
| 55 case 'I': | |
| 56 occurrence += len; | |
| 57 break; | |
| 58 default: | |
| 59 break; | |
| 60 } | |
| 61 } | |
| 62 | |
| 63 start = read_length - occurrence; | |
| 64 stop = start + read_length; | |
| 65 | |
| 66 for(int i = start; i < base_hits; i++) { | |
| 67 if(i == stop) break; | |
| 68 record._base_hits[i] = 1; | |
| 69 } | |
| 70 } | |
| 71 else { | |
| 72 start = pos_in_gene - 1; | |
| 73 stop = start + read_length; | |
| 74 | |
| 75 for(int i = start; i < base_hits; i++) { | |
| 76 if(i == stop) break; | |
| 77 record._base_hits[i] = 1; | |
| 78 } | |
| 79 } | |
| 80 } | |
| 81 | |
| 82 /** | |
| 83 * Returns gene fraction of fasta record | |
| 84 * Returns -1 if gene fraction is not greater than threshold | |
| 85 */ | |
| 86 double coverage(const FastaRecord &record, const int &threshold) { | |
| 87 double gene_coverage; | |
| 88 | |
| 89 int base_hits, gene_size; | |
| 90 | |
| 91 base_hits = record.get_base_hits(); | |
| 92 gene_size = record.gene().length(); | |
| 93 | |
| 94 gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100; | |
| 95 | |
| 96 if(gene_coverage > threshold) | |
| 97 return gene_coverage; | |
| 98 return -1; | |
| 99 } | |
| 100 | |
| 101 /** | |
| 102 * Writes header to output file when | |
| 103 * reading from stdin | |
| 104 */ | |
| 105 void bam_stream_header() { | |
| 106 header h; | |
| 107 char sep = ','; | |
| 108 | |
| 109 std::cout << h.level << sep << h.iteration << sep | |
| 110 << h.gene_id << sep << h.gene_fraction << sep | |
| 111 << h.hits << '\n'; | |
| 112 } | |
| 113 | |
| 114 /** | |
| 115 * Writes header to output file when | |
| 116 * reading from sam file | |
| 117 */ | |
| 118 void file_header(const std::string &out_fp, const std::string &sam_fp) { | |
| 119 header h; | |
| 120 std::ofstream out(out_fp.c_str(), std::ofstream::app ); | |
| 121 char sep = ','; | |
| 122 | |
| 123 //out << "@" << sam_fp << '\n'; | |
| 124 out << h.level << sep << h.iteration << sep | |
| 125 << h.gene_id << sep << h.gene_fraction << sep | |
| 126 << h.hits << '\n'; | |
| 127 out.close(); | |
| 128 } | |
| 129 | |
| 130 /** | |
| 131 * | |
| 132 */ | |
| 133 void create_header(cmd_args &args) { | |
| 134 if(args.bam_stream) { | |
| 135 bam_stream_header(); | |
| 136 } | |
| 137 else { | |
| 138 file_header(args.out_fp, args.sam_fp); | |
| 139 } | |
| 140 } | |
| 141 | |
| 142 /** | |
| 143 * Writes results to output file when reading from | |
| 144 * stdin | |
| 145 */ | |
| 146 void bam_stream_results(std::vector<FastaRecord> &records, | |
| 147 const int &level, const int &iteration, | |
| 148 cmd_args &args) { | |
| 149 | |
| 150 double gene_fraction; | |
| 151 int hits_seen; | |
| 152 std::string id; | |
| 153 char sep = ','; | |
| 154 | |
| 155 for(auto &rec : records) { | |
| 156 gene_fraction = coverage(rec, args.threshold); | |
| 157 hits_seen = rec.gene_hits(); | |
| 158 id = rec.gene_id(); | |
| 159 | |
| 160 if(gene_fraction > 0) { | |
| 161 std::cout << level << sep << iteration << sep | |
| 162 << id << sep << gene_fraction << sep | |
| 163 << hits_seen << '\n'; | |
| 164 } | |
| 165 } | |
| 166 } | |
| 167 | |
| 168 /** | |
| 169 * Write results when reading sam file from | |
| 170 * path | |
| 171 */ | |
| 172 void file_results(std::vector<FastaRecord> &records, | |
| 173 const int level, const int &iteration, | |
| 174 cmd_args &args) { | |
| 175 | |
| 176 std::ofstream out(args.out_fp.c_str(), std::ofstream::app); | |
| 177 | |
| 178 double gene_fraction; | |
| 179 int hits_seen; | |
| 180 std::string id; | |
| 181 char sep = ','; | |
| 182 | |
| 183 for(auto &rec : records) { | |
| 184 gene_fraction = coverage(rec, args.threshold); | |
| 185 hits_seen = rec.gene_hits(); | |
| 186 id = rec.gene_id(); | |
| 187 | |
| 188 if(gene_fraction > 0) { | |
| 189 out << level << sep << iteration << sep | |
| 190 << id << sep << gene_fraction << sep | |
| 191 << hits_seen << '\n'; | |
| 192 } | |
| 193 } | |
| 194 out.close(); | |
| 195 } | |
| 196 | |
| 197 /** | |
| 198 * | |
| 199 */ | |
| 200 void report_results(std::vector<FastaRecord> &records, | |
| 201 const int level, const int &iteration, | |
| 202 cmd_args &args) { | |
| 203 | |
| 204 if(args.bam_stream) { | |
| 205 bam_stream_results(records,level,iteration,args); | |
| 206 } | |
| 207 else { | |
| 208 file_results(records,level,iteration,args); | |
| 209 } | |
| 210 } | |
| 211 | |
| 212 /** | |
| 213 * Generates a sequence of samples from sam file specified | |
| 214 * by the starting level, max level, and skip pattern | |
| 215 */ | |
| 216 void generate_samples(std::vector<FastaRecord> &records, | |
| 217 std::vector<Alignments> &alignments, | |
| 218 cmd_args &args) { | |
| 219 | |
| 220 int read_count = alignments.size(); | |
| 221 int sample_size; | |
| 222 | |
| 223 srand(unsigned(time(0))); | |
| 224 | |
| 225 std::vector<int> sequence(read_count); | |
| 226 iota(sequence.begin(), sequence.end(), 0); | |
| 227 | |
| 228 create_header(args); | |
| 229 | |
| 230 for(int level = args.min; level <= args.max; level += args.skip) { | |
| 231 for(int sample = 0; sample < args.s_per_lev; sample++) { | |
| 232 random_shuffle(sequence.begin(), sequence.end(), randomize); | |
| 233 sample_size = round(((static_cast<double>(level)/100)*read_count)); | |
| 234 std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size); | |
| 235 | |
| 236 for(int a_idx = 0; a_idx < chosen.size(); a_idx++) { | |
| 237 std::string rname = alignments[chosen[a_idx]].rname(); | |
| 238 int gene_idx = FastaRecord::find_gene(records, rname); | |
| 239 analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]); | |
| 240 } | |
| 241 report_results(records,level,sample+1,args); | |
| 242 FastaRecord::reset_base_hits(records); | |
| 243 FastaRecord::reset_gene_hits(records); | |
| 244 } | |
| 245 } | |
| 246 } | |
| 247 | |
| 248 #endif /* SAM_RATIO_H */ |
