Mercurial > repos > chrisd > testing
diff gene_fraction/src/SamRatio.h @ 0:f95150c37d38 draft default tip
planemo upload for repository https://github.com/ChrisD11/Tools commit ddc95e5d6b5f2c0a5340c0bc384aa822db8856d5
author | chrisd |
---|---|
date | Sun, 21 Feb 2016 23:31:55 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/SamRatio.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,248 @@ +#ifndef SAM_RATIO_H +#define SAM_RATIO_H + +#include <iostream> +#include <fstream> +#include <algorithm> + +#include "FastaRecord.h" +#include "Alignments.h" +#include "args.h" + +typedef std::vector<std::pair<int,char>> cigar_str; + +/** + * + */ +struct header { + std::string level = "Level"; + std::string iteration = "Iteration"; + std::string gene_id = "Gene id"; + std::string gene_fraction = "Gene Fraction"; + std::string hits = "Hits"; +}; + +/** + * Reports the total number of bases that were touched for each + * gene. This is largely calculated using the positional and seq + * information found in fields four and ten of each alignment + */ +void analyze_coverage(FastaRecord &record, Alignments &alignment) { + record.update_gene_hits(); + + cigar_str cigar = alignment.cigar(); + + int len; + char op; + + int occurrence; + int pos_in_gene = alignment.pos(); + + int start, stop; + int base_hits = record._base_hits.size(); // func this + int read_length = alignment.seq().length(); //func this + + if(pos_in_gene == 1) { + occurrence = 0; + for(int i = 0; i < cigar.size(); i++) { + len = cigar[i].first; + op = cigar[i].second; + + switch(op) { + case 'M': + occurrence += len; + break; + case 'I': + occurrence += len; + break; + default: + break; + } + } + + start = read_length - occurrence; + stop = start + read_length; + + for(int i = start; i < base_hits; i++) { + if(i == stop) break; + record._base_hits[i] = 1; + } + } + else { + start = pos_in_gene - 1; + stop = start + read_length; + + for(int i = start; i < base_hits; i++) { + if(i == stop) break; + record._base_hits[i] = 1; + } + } +} + +/** + * Returns gene fraction of fasta record + * Returns -1 if gene fraction is not greater than threshold + */ +double coverage(const FastaRecord &record, const int &threshold) { + double gene_coverage; + + int base_hits, gene_size; + + base_hits = record.get_base_hits(); + gene_size = record.gene().length(); + + gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100; + + if(gene_coverage > threshold) + return gene_coverage; + return -1; +} + +/** + * Writes header to output file when + * reading from stdin + */ +void bam_stream_header() { + header h; + char sep = ','; + + std::cout << h.level << sep << h.iteration << sep + << h.gene_id << sep << h.gene_fraction << sep + << h.hits << '\n'; +} + +/** + * Writes header to output file when + * reading from sam file + */ +void file_header(const std::string &out_fp, const std::string &sam_fp) { + header h; + std::ofstream out(out_fp.c_str(), std::ofstream::app ); + char sep = ','; + + //out << "@" << sam_fp << '\n'; + out << h.level << sep << h.iteration << sep + << h.gene_id << sep << h.gene_fraction << sep + << h.hits << '\n'; + out.close(); +} + +/** + * + */ +void create_header(cmd_args &args) { + if(args.bam_stream) { + bam_stream_header(); + } + else { + file_header(args.out_fp, args.sam_fp); + } +} + +/** + * Writes results to output file when reading from + * stdin + */ +void bam_stream_results(std::vector<FastaRecord> &records, + const int &level, const int &iteration, + cmd_args &args) { + + double gene_fraction; + int hits_seen; + std::string id; + char sep = ','; + + for(auto &rec : records) { + gene_fraction = coverage(rec, args.threshold); + hits_seen = rec.gene_hits(); + id = rec.gene_id(); + + if(gene_fraction > 0) { + std::cout << level << sep << iteration << sep + << id << sep << gene_fraction << sep + << hits_seen << '\n'; + } + } +} + +/** + * Write results when reading sam file from + * path + */ +void file_results(std::vector<FastaRecord> &records, + const int level, const int &iteration, + cmd_args &args) { + + std::ofstream out(args.out_fp.c_str(), std::ofstream::app); + + double gene_fraction; + int hits_seen; + std::string id; + char sep = ','; + + for(auto &rec : records) { + gene_fraction = coverage(rec, args.threshold); + hits_seen = rec.gene_hits(); + id = rec.gene_id(); + + if(gene_fraction > 0) { + out << level << sep << iteration << sep + << id << sep << gene_fraction << sep + << hits_seen << '\n'; + } + } + out.close(); +} + +/** + * + */ +void report_results(std::vector<FastaRecord> &records, + const int level, const int &iteration, + cmd_args &args) { + + if(args.bam_stream) { + bam_stream_results(records,level,iteration,args); + } + else { + file_results(records,level,iteration,args); + } +} + +/** + * Generates a sequence of samples from sam file specified + * by the starting level, max level, and skip pattern + */ +void generate_samples(std::vector<FastaRecord> &records, + std::vector<Alignments> &alignments, + cmd_args &args) { + + int read_count = alignments.size(); + int sample_size; + + srand(unsigned(time(0))); + + std::vector<int> sequence(read_count); + iota(sequence.begin(), sequence.end(), 0); + + create_header(args); + + for(int level = args.min; level <= args.max; level += args.skip) { + for(int sample = 0; sample < args.s_per_lev; sample++) { + random_shuffle(sequence.begin(), sequence.end(), randomize); + sample_size = round(((static_cast<double>(level)/100)*read_count)); + std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size); + + for(int a_idx = 0; a_idx < chosen.size(); a_idx++) { + std::string rname = alignments[chosen[a_idx]].rname(); + int gene_idx = FastaRecord::find_gene(records, rname); + analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]); + } + report_results(records,level,sample+1,args); + FastaRecord::reset_base_hits(records); + FastaRecord::reset_gene_hits(records); + } + } +} + +#endif /* SAM_RATIO_H */