Mercurial > repos > mvdbeek > damid_deseq2_to_bedgraph
view damid_to_bedgraph.py @ 0:755cbe6825b5 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_bedgraph commit 98722d2ca8205595f032361072aaab450e5f4f83
author | mvdbeek |
---|---|
date | Fri, 14 Dec 2018 06:27:41 -0500 |
parents | |
children |
line wrap: on
line source
from collections import OrderedDict import click import numpy as np import pandas as pd import traces def order_index(df): """ Split chr_start_stop in df index and order by chrom and start. """ idx = df.index.str.split('_') idx = pd.DataFrame.from_records(list(idx)) idx.columns = ['chr', 'start', 'stop'] idx = idx.astype(dtype={"chr": "object", "start": "int32", "stop": "int32"}) coordinates = idx.sort_values(['chr', 'start']) df.index = np.arange(len(df.index)) df = df.loc[coordinates.index] df = coordinates.join(df) # index is center of GATC site df.index = df['start'] + 2 return df def interpolate_values(df, sampling_width=100): result = [] for chrom in df['chr'].unique(): chrom_df = df[df['chr'] == chrom] time_series = traces.TimeSeries(chrom_df['log2FC']) s = pd.DataFrame.from_records(time_series.sample(sampling_width, interpolate='linear')) # Calculate new start and end of interpolated region start = s[0] - int(sampling_width / 2) start.loc[start < 0] = 1 end = s[0] + int(sampling_width / 2) result.append(pd.DataFrame(OrderedDict([('chr', chrom), ('start', start), ('end', end), ('score', s[1])]))) return pd.concat(result) @click.command() @click.argument('input_path', type=click.Path(exists=True), required=True) @click.argument('output_path', type=click.Path(exists=False), required=True) @click.option('--resolution', help="Interpolate log2 fold change at this resolution (in basepairs)", default=50) def deseq2_to_bedgraph(input_path, output_path, resolution=50): """Convert deseq2 output on GATC fragments to bedgraph file with interpolated values.""" df = pd.read_csv(input_path, sep='\t', header=None, index_col=0, usecols=[0, 2], names=['GATC', 'log2FC']) df = df[~df.index.str.contains('\.')] df = order_index(df) r = interpolate_values(df, sampling_width=resolution) r.to_csv(output_path, sep='\t', header=None, index=None) if __name__ == '__main__': deseq2_to_bedgraph()