# Mercurial > repos > mvdbeek > damid_deseq2_to_peaks

import click
import pandas as pd
import numpy as np


def order_index(df):
    """
    Split chr_start_stop in df index and order by chrom and start.

    Each index label is split from the right on its last two underscores, so
    chromosome names that themselves contain underscores (for example
    ``chr1_random``) stay intact.

    :param df: DataFrame whose index labels have the form ``chr_start_stop``.
        The frame passed in is not modified.
    :returns: a new DataFrame holding ``chr``, ``start`` and ``stop`` columns
        followed by the columns of ``df``, with rows sorted by ``chr`` and then
        ``start`` and indexed by ``start + 2``.
    :raises ValueError: if an index label does not consist of three
        underscore-separated fields or start/stop are not integers.
    """
    records = []
    for label in df.index:
        # rsplit with a limit keeps any underscores inside the chromosome name
        fields = str(label).rsplit('_', 2)
        if len(fields) != 3:
            raise ValueError("Index label %r is not of the form chr_start_stop" % (label,))
        records.append(fields)

    coordinates = pd.DataFrame(records, columns=['chr', 'start', 'stop'])
    coordinates = coordinates.astype(dtype={"chr": "object",
                                            "start": "int32",
                                            "stop": "int32"})
    coordinates = coordinates.sort_values(['chr', 'start'])
    # reset_index returns a new frame with a positional index that lines up
    # with `coordinates`, so the caller's frame keeps its original index
    values = df.reset_index(drop=True)
    ordered = coordinates.join(values)
    # index is center of GATC site
    ordered.index = ordered['start'] + 2
    return ordered


def significant_gatcs_to_peaks(df, p_value_cutoff):
    """
    Merge runs of consecutive significant GATC sites into peaks.

    A site is significant when column ``6`` is below ``p_value_cutoff`` and
    column ``2`` is positive. Consecutive sites on the same chromosome that
    share the same significance state form a run; runs of two or more
    significant sites are reported as peaks.

    :param df: DataFrame with ``chr``, ``start`` and ``stop`` columns plus
        the columns labelled ``2`` and ``6``. Rows are expected to be grouped
        by chromosome and ordered by start, as produced by ``order_index``.
        The frame passed in is not modified.
    :param p_value_cutoff: exclusive upper bound on column ``6`` for a site to
        count as significant.
    :returns: DataFrame with ``chr``, ``start`` and ``stop`` columns, one row
        per peak.
    """
    # Work on a private, positionally indexed copy: the caller's frame stays
    # untouched and duplicate index labels cannot interfere with assignment.
    gatcs = df[['chr', 'start', 'stop']].reset_index(drop=True)
    significant = ((df[6] < p_value_cutoff) & (df[2] > 0)).to_numpy()
    # `pass` flags sig. GATCs
    gatcs['pass'] = significant.astype('int64')
    # Create pass_id column for consecutive pass or no-pass GATCs.
    # The per-chromosome diff is NaN on the first site of each chromosome, so
    # a new id starts at every chromosome boundary and at every value change.
    gatcs['pass_id'] = gatcs.groupby('chr')['pass'].diff().ne(0).cumsum()
    gb = gatcs.groupby('pass_id')
    # String names select the built-in groupby reductions; passing NumPy
    # callables here is deprecated in recent pandas releases.
    consecutive_gatcs = gb.aggregate({'chr': 'min', 'start': 'min', 'stop': 'max', 'pass': 'max'})
    # keep only groups with 2 or more GATCS
    has_multiple_sites = gb.size() > 1
    consecutive_only = consecutive_gatcs[has_multiple_sites]
    # drop GATC groups that are not significant
    peaks = consecutive_only[consecutive_only['pass'] == 1][['chr', 'start', 'stop']]
    # calculate region that is not covered.
    no_peaks = consecutive_only[consecutive_only['pass'] == 0][['chr', 'start', 'stop']]
    not_covered = no_peaks['stop'] - no_peaks['start']
    print("%s nt not covered by peaks" % not_covered.sum())
    covered = peaks['stop'] - peaks['start']
    print("%s nt covered by peaks" % covered.sum())
    return peaks


@click.command()
@click.argument('input_path', type=click.Path(exists=True))
@click.argument('output_path', type=click.Path())
@click.option('--p_value_cutoff', type=float, default=0.01,
              help="Adjusted p-value cutoff; GATC sites with an adjusted p-value below this are significant")
def deseq2_gatc_to_peak(input_path, output_path, p_value_cutoff):
    """Convert a headerless, tab-separated per-GATC table at INPUT_PATH into peaks written to OUTPUT_PATH."""
    # The first column holds the chr_start_stop site label and becomes the
    # index; the remaining columns keep their positional integer labels.
    df = pd.read_csv(input_path, sep='\t', header=None, index_col=0)
    df = order_index(df)
    peaks = significant_gatcs_to_peaks(df, p_value_cutoff)
    # Write only the chr/start/stop columns, without header row or index
    peaks.to_csv(output_path, sep='\t', header=False, index=False)


# Invoke the click command only when this file is executed as a script.
if __name__ == '__main__':
    deseq2_gatc_to_peak()
# author	mvdbeek
# date	Tue, 08 Jan 2019 04:01:54 -0500
# parents	3fd7995da4fd
# children