Mercurial > repos > chrisd > testing
view gene_fraction/src/SamRatio.h @ 0:f95150c37d38 draft default tip
planemo upload for repository https://github.com/ChrisD11/Tools commit ddc95e5d6b5f2c0a5340c0bc384aa822db8856d5
author | chrisd |
---|---|
date | Sun, 21 Feb 2016 23:31:55 -0500 |
parents | |
children |
line wrap: on
line source
#ifndef SAM_RATIO_H #define SAM_RATIO_H #include <iostream> #include <fstream> #include <algorithm> #include "FastaRecord.h" #include "Alignments.h" #include "args.h" typedef std::vector<std::pair<int,char>> cigar_str; /** * */ struct header { std::string level = "Level"; std::string iteration = "Iteration"; std::string gene_id = "Gene id"; std::string gene_fraction = "Gene Fraction"; std::string hits = "Hits"; }; /** * Reports the total number of bases that were touched for each * gene. This is largely calculated using the positional and seq * information found in fields four and ten of each alignment */ void analyze_coverage(FastaRecord &record, Alignments &alignment) { record.update_gene_hits(); cigar_str cigar = alignment.cigar(); int len; char op; int occurrence; int pos_in_gene = alignment.pos(); int start, stop; int base_hits = record._base_hits.size(); // func this int read_length = alignment.seq().length(); //func this if(pos_in_gene == 1) { occurrence = 0; for(int i = 0; i < cigar.size(); i++) { len = cigar[i].first; op = cigar[i].second; switch(op) { case 'M': occurrence += len; break; case 'I': occurrence += len; break; default: break; } } start = read_length - occurrence; stop = start + read_length; for(int i = start; i < base_hits; i++) { if(i == stop) break; record._base_hits[i] = 1; } } else { start = pos_in_gene - 1; stop = start + read_length; for(int i = start; i < base_hits; i++) { if(i == stop) break; record._base_hits[i] = 1; } } } /** * Returns gene fraction of fasta record * Returns -1 if gene fraction is not greater than threshold */ double coverage(const FastaRecord &record, const int &threshold) { double gene_coverage; int base_hits, gene_size; base_hits = record.get_base_hits(); gene_size = record.gene().length(); gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100; if(gene_coverage > threshold) return gene_coverage; return -1; } /** * Writes header to output file when * reading from stdin */ void bam_stream_header() { header h; char sep = ','; std::cout << h.level << sep << h.iteration << sep << h.gene_id << sep << h.gene_fraction << sep << h.hits << '\n'; } /** * Writes header to output file when * reading from sam file */ void file_header(const std::string &out_fp, const std::string &sam_fp) { header h; std::ofstream out(out_fp.c_str(), std::ofstream::app ); char sep = ','; //out << "@" << sam_fp << '\n'; out << h.level << sep << h.iteration << sep << h.gene_id << sep << h.gene_fraction << sep << h.hits << '\n'; out.close(); } /** * */ void create_header(cmd_args &args) { if(args.bam_stream) { bam_stream_header(); } else { file_header(args.out_fp, args.sam_fp); } } /** * Writes results to output file when reading from * stdin */ void bam_stream_results(std::vector<FastaRecord> &records, const int &level, const int &iteration, cmd_args &args) { double gene_fraction; int hits_seen; std::string id; char sep = ','; for(auto &rec : records) { gene_fraction = coverage(rec, args.threshold); hits_seen = rec.gene_hits(); id = rec.gene_id(); if(gene_fraction > 0) { std::cout << level << sep << iteration << sep << id << sep << gene_fraction << sep << hits_seen << '\n'; } } } /** * Write results when reading sam file from * path */ void file_results(std::vector<FastaRecord> &records, const int level, const int &iteration, cmd_args &args) { std::ofstream out(args.out_fp.c_str(), std::ofstream::app); double gene_fraction; int hits_seen; std::string id; char sep = ','; for(auto &rec : records) { gene_fraction = coverage(rec, args.threshold); hits_seen = rec.gene_hits(); id = rec.gene_id(); if(gene_fraction > 0) { out << level << sep << iteration << sep << id << sep << gene_fraction << sep << hits_seen << '\n'; } } out.close(); } /** * */ void report_results(std::vector<FastaRecord> &records, const int level, const int &iteration, cmd_args &args) { if(args.bam_stream) { bam_stream_results(records,level,iteration,args); } else { file_results(records,level,iteration,args); } } /** * Generates a sequence of samples from sam file specified * by the starting level, max level, and skip pattern */ void generate_samples(std::vector<FastaRecord> &records, std::vector<Alignments> &alignments, cmd_args &args) { int read_count = alignments.size(); int sample_size; srand(unsigned(time(0))); std::vector<int> sequence(read_count); iota(sequence.begin(), sequence.end(), 0); create_header(args); for(int level = args.min; level <= args.max; level += args.skip) { for(int sample = 0; sample < args.s_per_lev; sample++) { random_shuffle(sequence.begin(), sequence.end(), randomize); sample_size = round(((static_cast<double>(level)/100)*read_count)); std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size); for(int a_idx = 0; a_idx < chosen.size(); a_idx++) { std::string rname = alignments[chosen[a_idx]].rname(); int gene_idx = FastaRecord::find_gene(records, rname); analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]); } report_results(records,level,sample+1,args); FastaRecord::reset_base_hits(records); FastaRecord::reset_gene_hits(records); } } } #endif /* SAM_RATIO_H */