changeset 0:3fd7995da4fd draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
author mvdbeek
date Mon, 07 Jan 2019 12:58:55 -0500
parents
children edca422b6cd6
files damid_deseq2_to_peaks.py damid_deseq2_to_peaks.xml test-data/peaks.bed test-data/test_data.tab
diffstat 4 files changed, 96 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/damid_deseq2_to_peaks.py	Mon Jan 07 12:58:55 2019 -0500
@@ -0,0 +1,62 @@
+import click
+import pandas as pd
+import numpy as np
+
+
+def order_index(df):
+    """
+    Split chr_start_stop in df index and order by chrom and start.
+    """
+    idx = df.index.str.split('_')
+    idx = pd.DataFrame.from_records(list(idx))
+
+    idx.columns = ['chr', 'start', 'stop']
+    idx = idx.astype(dtype={"chr": "object",
+                            "start": "int32",
+                            "stop": "int32"})
+    coordinates = idx.sort_values(['chr', 'start'])
+    df.index = np.arange(len(df.index))
+    df = df.loc[coordinates.index]
+    df = coordinates.join(df)
+    # index is center of GATC site
+    df.index = df['start'] + 2
+    return df
+
+
+def significant_gatcs_to_peaks(df, p_value_cutoff):
+    # Add `pass` column for sig. GATCs
+    df['pass'] = 0
+    df.loc[df[6] < p_value_cutoff, 'pass'] = 1
+    # Create pass_id column for consecutive pass or no-pass GATCs
+    # True whenever there is a value change (from previous value):
+    df['pass_id'] = df.groupby('chr')['pass'].diff().ne(0).cumsum()
+    gb = df.groupby('pass_id')
+    # aggregate
+    consecutive_gatcs = gb.aggregate({'chr': np.min, 'start': np.min, 'stop': np.max, 'pass': np.max})
+    # keep only groups with 2 or more GATCS
+    s = gb.size() > 1
+    consecutive_only = consecutive_gatcs[s]
+    # drop GATC groups that are not significant
+    peaks = consecutive_only[consecutive_only['pass'] == 1][['chr', 'start', 'stop']]
+    # calculate region that is not covered.
+    no_peaks = consecutive_only[consecutive_only['pass'] == 0][['chr', 'start', 'stop']]
+    s = no_peaks['stop'] - no_peaks['start']
+    print("%s nt not covered by peaks" % s.sum())
+    s = peaks['stop'] - peaks['start']
+    print("%s nt covered by peaks" % s.sum())
+    return peaks
+
+
+@click.command()
+@click.argument('input_path', type=click.Path(exists=True))
+@click.argument('output_path', type=click.Path())
+@click.option('--p_value_cutoff', type=float, default=0.01, help="Minimum adjusted p-value for a significant GATC site")
+def deseq2_gatc_to_peak(input_path, output_path, p_value_cutoff):
+    df = pd.read_csv(input_path, sep='\t', header=None, index_col=0)
+    df = order_index(df)
+    peaks = significant_gatcs_to_peaks(df, p_value_cutoff)
+    peaks.to_csv(output_path, sep='\t', header=None, index=None)
+
+
+if __name__ == '__main__':
+    deseq2_gatc_to_peak()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/damid_deseq2_to_peaks.xml	Mon Jan 07 12:58:55 2019 -0500
@@ -0,0 +1,25 @@
+<tool id="damid_deseq2_to_peaks" name="DamID deseq2 to peaks" version="0.1.0">
+    <requirements>
+        <requirement type="package" version="7">click</requirement>
+        <requirement type="package" version="0.23.4">pandas</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/damid_deseq2_to_peaks.py' '$input' '$output' --p_value_cutoff $p_value_cutoff
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Select DESeq2 result file for GATC sites"/>
+        <param argument="p_value_cutoff" type="float" value="0.01" min="0" max="1" label="Select minimum p-value to consider a GATC significantly bound"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="bed"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="test_data.tab"/>
+            <output name="output" value="peaks.bed"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Converts a deseq2 GATC result file into peaks in bed format.
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/peaks.bed	Mon Jan 07 12:58:55 2019 -0500
@@ -0,0 +1,2 @@
+4	1	251
+4	301	501
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_data.tab	Mon Jan 07 12:58:55 2019 -0500
@@ -0,0 +1,7 @@
+4_1_101	179.471138895981	6.11378960505906	0.621838986245469	9.83178883970079	8.2146464523617e-23	5.91609955253763e-19
+4_101_201	294.81296137271	5.60565739977369	0.577313080365325	9.70990886994353	2.73581605967775e-22	1.38780673739166e-18
+4_201_251	169.22778018355	5.7093345566596	0.605350118969888	9.43145855224256	4.0442550874846e-21	1.15555073230549e-17
+4_251_276	169.22778018355	5.7093345566596	0.605350118969888	9.43145855224256	4.0442550874846e-21	1.15555073230549e-1
+4_276_301	169.22778018355	5.7093345566596	0.605350118969888	9.43145855224256	4.0442550874846e-21	1.15555073230549e-1
+4_301_401	412.221939093451	4.95034623243659	0.53002737065317	9.33979357770923	9.65221128246454e-21	2.46758781436206e-17
+4_401_501	165.881402812843	5.71524958810642	0.620499842499124	9.21071883771587	3.23948149277376e-20	6.66609779385658e-17