Mercurial > repos > mvdbeek > damid_deseq2_to_bedgraph
changeset 0:755cbe6825b5 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_bedgraph commit 98722d2ca8205595f032361072aaab450e5f4f83
author | mvdbeek |
---|---|
date | Fri, 14 Dec 2018 06:27:41 -0500 |
parents | |
children | 210b7f4c9acb |
files | damid_interpolate.xml damid_to_bedgraph.py test-data/line_count_100.tab test-data/line_count_50.tab test-data/test_data.tab |
diffstat | 5 files changed, 110 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/damid_interpolate.xml Fri Dec 14 06:27:41 2018 -0500 @@ -0,0 +1,34 @@ +<tool id="damid_interpolate" name="Interpolate GATC signal" version="0.1.0"> + <description>from DESeq2 processed DamID data</description> + <requirements> + <requirement type="package" version="2.4.1">traces</requirement> + <requirement type="package" version="7">click</requirement> + <requirement type="package" version="0.23.4">pandas</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/damid_to_bedgraph.py' '$input' '$output' --resolution $resolution + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Select DESeq2 result file for GATC sites"/> + <param argument="--resolution" type="integer" value="50" label="Select resolution for intervals"/> + </inputs> + <outputs> + <data name="output" format="bedgraph"/> + </outputs> + <tests> + <test> + <param name="input" value="test_data.tab"/> + <output name="output" value="line_count_50.tab"/> + </test> + <test> + <param name="input" value="test_data.tab"/> + <param name="resolution" value="100"/> + <output name="output" value="line_count_100.tab"/> + </test> + </tests> + <help><![CDATA[ +Takes a deseq2 output file and generate a bedgraph file, where the score column +is the log2 fold change value interpolated for intervals of the size selected in +"Select resolution for intervals". + ]]></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/damid_to_bedgraph.py Fri Dec 14 06:27:41 2018 -0500 @@ -0,0 +1,57 @@ +from collections import OrderedDict + +import click +import numpy as np +import pandas as pd +import traces + + +def order_index(df): + """ + Split chr_start_stop in df index and order by chrom and start. + """ + idx = df.index.str.split('_') + idx = pd.DataFrame.from_records(list(idx)) + + idx.columns = ['chr', 'start', 'stop'] + idx = idx.astype(dtype={"chr": "object", + "start": "int32", + "stop": "int32"}) + coordinates = idx.sort_values(['chr', 'start']) + df.index = np.arange(len(df.index)) + df = df.loc[coordinates.index] + df = coordinates.join(df) + # index is center of GATC site + df.index = df['start'] + 2 + return df + + +def interpolate_values(df, sampling_width=100): + result = [] + for chrom in df['chr'].unique(): + chrom_df = df[df['chr'] == chrom] + time_series = traces.TimeSeries(chrom_df['log2FC']) + s = pd.DataFrame.from_records(time_series.sample(sampling_width, interpolate='linear')) + # Calculate new start and end of interpolated region + start = s[0] - int(sampling_width / 2) + start.loc[start < 0] = 1 + end = s[0] + int(sampling_width / 2) + result.append(pd.DataFrame(OrderedDict([('chr', chrom), ('start', start), ('end', end), ('score', s[1])]))) + return pd.concat(result) + + +@click.command() +@click.argument('input_path', type=click.Path(exists=True), required=True) +@click.argument('output_path', type=click.Path(exists=False), required=True) +@click.option('--resolution', help="Interpolate log2 fold change at this resolution (in basepairs)", default=50) +def deseq2_to_bedgraph(input_path, output_path, resolution=50): + """Convert deseq2 output on GATC fragments to bedgraph file with interpolated values.""" + df = pd.read_csv(input_path, sep='\t', header=None, index_col=0, usecols=[0, 2], names=['GATC', 'log2FC']) + df = df[~df.index.str.contains('\.')] + df = order_index(df) + r = interpolate_values(df, sampling_width=resolution) + r.to_csv(output_path, sep='\t', header=None, index=None) + + +if __name__ == '__main__': + deseq2_to_bedgraph()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/line_count_100.tab Fri Dec 14 06:27:41 2018 -0500 @@ -0,0 +1,5 @@ +4 1 53 6.113789605059059 +4 53 153 5.6056573997736905 +4 153 253 5.709334556659599 +4 253 353 4.95034623243659 +4 353 453 5.7152495881064205
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/line_count_50.tab Fri Dec 14 06:27:41 2018 -0500 @@ -0,0 +1,9 @@ +4 1 28 6.113789605059059 +4 28 78 5.859723502416375 +4 78 128 5.6056573997736905 +4 128 178 5.6574959782166445 +4 178 228 5.709334556659599 +4 228 278 5.3298403945480946 +4 278 328 4.95034623243659 +4 328 378 5.3327979102715055 +4 378 428 5.7152495881064205
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_data.tab Fri Dec 14 06:27:41 2018 -0500 @@ -0,0 +1,5 @@ +4_1_101 179.471138895981 6.11378960505906 0.621838986245469 9.83178883970079 8.2146464523617e-23 5.91609955253763e-19 +4_101_201 294.81296137271 5.60565739977369 0.577313080365325 9.70990886994353 2.73581605967775e-22 1.38780673739166e-18 +4_201_301 169.22778018355 5.7093345566596 0.605350118969888 9.43145855224256 4.0442550874846e-21 1.15555073230549e-17 +4_301_401 412.221939093451 4.95034623243659 0.53002737065317 9.33979357770923 9.65221128246454e-21 2.46758781436206e-17 +4_401_501 165.881402812843 5.71524958810642 0.620499842499124 9.21071883771587 3.23948149277376e-20 6.66609779385658e-17