view _modules/IData_handling.py @ 0:69e8f12c8b31 draft

"planemo upload"
author bioit_sciensano
date Fri, 11 Mar 2022 15:06:20 +0000
parents
children
line wrap: on
line source

## @file IData_handling.py
#
# VL: Gather here the classes and functions useful for handling input data.
from __future__ import print_function

import gzip
from _modules.utilities import reverseComplement,changeCase
from time import gmtime, strftime
import datetime

try:
    import cPickle as pickle
except ImportError:  # python 3.x
    import pickle


## This class encapsulates the reference sequences, the host sequence if any and all useful information about the sequences.
#
# It is used both for searching the read extracts in the sequences and for exploiting the results
class refData:
    def __init__(self,refseq_list,seed,hostseq):
        self.refseq_list=refseq_list
        self.seed=seed
        self.hostseq=hostseq
        if hostseq!="":
            self.refseq_list.insert(0,hostseq)
        self.nb_sequences=len(refseq_list)

    def getIdxSeq(self,refseq):
        idx=-1
        found=False
        for s in self.refseq_list:
            idx += 1
            if s==refseq:
                found=True
                break
        if not found:
            raise RuntimeError("Couldn't find sequence in list of ref sequences.")
        return idx


    def IdxIsHostseq(self,idx_seq):
        if (((self.hostseq == "") and (idx_seq <= self.nb_sequences - 1)) or (
            (self.hostseq != "") and (idx_seq >0))):
            return False
        return True

    def getSeqSizesList(self):
        seq_sizes_list = []
        for seq in self.refseq_list:
            seq_sizes_list.append(len(seq))
        return seq_sizes_list


## Base class for handling read extracts.
#
# This class should not be used directly.
class ReadExtracts:
    def __init__(self,seed):
        self.seed = seed
        self.r_extracts_list = []
        self.nb_reads = 0
        self.nb_extracts=0

    ## Returns the list of read extracts from the loaded dataset, the number of reads and the total number of extracts
    def getRExtracts(self):
        return self.r_extracts_list,self.nb_reads,self.nb_extracts

## Class containing all the read extracts (PE reads) that must be mapped against a sequence.
class readExtractsPE(ReadExtracts):
    def __init__(self,seed):
        self.__init__(seed)


    def addRead(self, whole_PE1,whole_PE2):
        self.r_extracts_list.append(whole_PE1[:self.seed])
        self.r_extracts_list.append(whole_PE1[-self.seed:])
        self.r_extracts_list.append(whole_PE2[:self.seed])
        self.r_extracts_list.append(reverseComplement(whole_PE2)[:self.seed])
        self.r_extracts_list.append(reverseComplement(whole_PE2)[-self.seed:])
        self.nb_reads += 1
        self.nb_extracts += 5  # Number of extracts per read: 2 extracts for PE1 and 3 for PE2.



## Class containing all the read extracts (single reads) that must be mapped against a sequence.
class readsExtractsS(ReadExtracts):
    def __init__(self,seed):
        ReadExtracts.__init__(self,seed)

    ## Adds a read to the list of extracts
    #
    # @param whole_read The read as extracted from the fastq file
    # @param no_pair This paramenter s only used to make the distinction between Single and paired.
    # Note VL: I didn't use meta programming here because I thought that it would have a negative impact on performance.
    # TODO: test it when all the rest works.
    def addRead(self,whole_read,no_pair=""):
        read_part = whole_read[:self.seed]
        self.r_extracts_list.append(read_part)
        self.r_extracts_list.append(whole_read[-self.seed:])
        self.r_extracts_list.append(reverseComplement(whole_read)[:self.seed])
        self.r_extracts_list.append(reverseComplement(whole_read)[-self.seed:])
        self.nb_reads+=1
        self.nb_extracts += 4

## use objects of this class to store read offset (PE1 and PE2) in files.
class ReadInfo:
    def __init__(self, off_PE1,whole_read,seed,off_PE2=None):
        self.offset1=off_PE1
        self.offset2=off_PE2
        self.corlen = len(whole_read) - seed

## Gets the number of reads in the fastq file
# def getNbReads(fastq):
#     with open(fastq) as f:
#         for i, l in enumerate(f):
#             pass
#     nb_r=i+1
#     nb_r=nb_r/4
#     return nb_r



## loads a chunk of reads for mapping on GPU.
# Yields a ReadExtracts object plus a dictionnary of ReadInfo.
# keeps in memory the parsing state of the file.
# @param ch_size is in number of reads
# @reset_ids indicates whether or not we want read index to be reset to 0 at the beginning of each chunk.
def getChunk(fastq,seed,paired,ch_siz,reset_ids=True):
    new_chunk = False
    d_rinfo=dict()
    idx_read=0
    off2=None
    filin = open(fastq)
    line = filin.readline()
    read_paired=""
    if paired != "":
        RE=readExtractsPE(seed)
        filin_paired = open(paired)
        line_paired = filin_paired.readline()
    else:
        RE=readsExtractsS(seed)

    start = False
    num_line=0
    while line:
        # Read sequence
        read = line.split("\n")[0].split("\r")[0]
        if paired != "":
            read_paired = line_paired.split("\n")[0].split("\r")[0]
        if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
            start = True
            off1=filin.tell()
            line = filin.readline()
            if paired != "":
                off2=filin_paired.tell()
                line_paired = filin_paired.readline()
            continue
        if (start == True):
            start = False
            readlen = len(read)
            if readlen < seed:
                line = filin.readline()
                if paired !="":
                    line_paired = filin_paired.readline() # also skip PE2 in that case
                continue
            RE.addRead(read,read_paired)
            d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
            if (idx_read>0 and ((idx_read+1)%(ch_siz)==0)):
                yield RE,d_rinfo
                if (reset_ids):
                    idx_read=0
                    new_chunk=True
                if paired != "":
                    RE = readExtractsPE(seed)
                else:
                    RE = readsExtractsS(seed)
                d_rinfo = dict()
            if not new_chunk:
                idx_read+=1
            else:
                new_chunk=False

        line = filin.readline()
        if paired!="":
            line_paired = filin_paired.readline()
    filin.close()
    if paired !="":
        filin_paired.close()
    yield RE, d_rinfo

## dumps a dictionnary of ReadInfo objects indexed on read index.
#
# @param d_rinfo dictionnary to dump
# @param fic filename (incl. full path) where to dump
def dump_d_rinfo(d_rinfo,fic):
    with open(fic, 'wb') as fp:
        pickle.dump(d_rinfo, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Loads a dictionnary of ReadInfo objects.
def load_d_rinfo(fic):
    with open(fic, 'rb') as fp:
        d_rinfo = pickle.load(fp)
    return d_rinfo


## loads all extracts of reads into a list for processing on GPU.
#
# returns 1 or 2 readExtracts objects plus a dictionnary of ReadInfo.
def getAllReads(fastq,seed,paired):
    d_rinfo=dict()
    idx_read=0
    off2=None
    filin = open(fastq)
    line = filin.readline()
    read_paired=""

    if paired != "":
        RE=readExtractsPE(seed)
        filin_paired = open(paired)
        line_paired = filin_paired.readline()
    else:
        RE=readsExtractsS(seed)

    start = False
    num_line=0
    while line:
        # Read sequence
        read = line.split("\n")[0].split("\r")[0]
        if paired != "":
            read_paired = line_paired.split("\n")[0].split("\r")[0]
        if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
            start = True
            off1=filin.tell()
            line = filin.readline()
            if paired != "":
                off2=filin_paired.tell()
                line_paired = filin_paired.readline()
            continue
        if (start == True):
            start = False
            readlen = len(read)
            if readlen < seed:
                line = filin.readline()
                if paired !="":
                    line_paired = filin_paired.readline() # also skip PE2 in that case
                continue
            RE.addRead(read,read_paired)
            d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
            idx_read+=1

        line = filin.readline()
        if paired!="":
            line_paired = filin_paired.readline()
    filin.close()
    if paired !="":
        filin_paired.close()
    return RE,d_rinfo

## use this class to retrieve reads from fastq file.
class ReadGetter:
    ## constructor
    #
    # @param fastq Name of the fastq file that contains the read
    # @param d_rinfo A dictionnary of ReadInfo objects that contains the offset indicating where the read starts in the file.
    # @param paired The name of the file containing the PE2 (defaults to "").
    def __init__(self,fastq,d_rinfo,paired=""):
        self.filin=open(fastq)
        self.d_rinfo=d_rinfo
        self.paired=paired
        if paired!="":
            self.filinp=open(fastq)

    def getOneRead(self,idx_read):
        read_paired=""
        self.filin.seek(self.d_rinfo[idx_read].offset1)
        read=self.filin.readline()
        if self.paired!="":
            self.filinp.seek(self.d_rinfo[idx_read].offset2)
            read_paired = self.filinp.readline()
        return read,read_paired


### READS Functions
def totReads(filin):
    """Verify and retrieve the number of reads in the fastq file before alignment"""
    if filin.endswith('.gz'):
        filein = gzip.open(filin, 'rb')
    else:
        filein = open(filin, 'r')

    line = 0
    while filein.readline():
        line += 1
    seq = float(round(line / 4))
    filein.close()
    return seq

### SEQUENCE parsing function
def genomeFastaRecovery(filin, limit_reference, edge, host_test = 0):
    """Get genome sequence from fasta file"""
    print("recovering genome from: ",filin)
    print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
    if filin == "":
        return "", "", ""

    infile = open(filin, 'r')
    name = []
    genome = []
    genome_line = ""
    genome_rejected = 0
    for line in infile:
        if line[0] == ">":
            if name != []:
                if len(genome_line) >= limit_reference:
                    genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
                else:
                    genome_rejected += 1
                    name = name[:-1]
                genome_line = ""
            name.append(line[1:].split('\r')[0].split('\n')[0])
        else:
            genome_line += changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0]

    if len(genome_line) >= limit_reference:
        genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
        genome_line = ""
    else:
        genome_rejected += 1
        name = name[:-1]

    infile.close()

    if host_test:
        return "".join(genome)
    else:
        return genome, name, genome_rejected
    close(filin)