diff nf/subworkflows/ncbi/gnomon/gnomon_wnode/main.nf @ 0:d9c5c5b87fec draft

planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
author fubar
date Sat, 03 Aug 2024 11:16:53 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nf/subworkflows/ncbi/gnomon/gnomon_wnode/main.nf	Sat Aug 03 11:16:53 2024 +0000
@@ -0,0 +1,157 @@
+#!/usr/bin/env nextflow
+nextflow.enable.dsl=2
+
+include { merge_params } from '../../utilities'
+
+
+workflow gnomon_wnode {
+    take:
+        scaffolds
+        chains
+        chains_slices
+        hmm_params
+        softmask_lds2
+        softmask_lds2_source
+        genome
+        proteins
+        parameters  // Map : extra parameters and parameter updates
+    main:
+        String gpx_qsubmit_params = merge_params("", parameters, 'gpx_qsubmit')
+        String annot_params = merge_params("-margin 1000 -mincont 1000 -minlen 225 -mpp 10.0 -ncsp 25 -window 200000 -nonconsens -open", parameters, 'annot_wnode')
+        String gpx_qdump_params = merge_params("-slices-for affinity -sort-by affinity", parameters, 'gpx_qdump')
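+        // merge_params presumably overlays any user-supplied settings from the
+        // 'parameters' map onto the defaults above; e.g. a hypothetical
+        //   merge_params("-window 200000", [annot_wnode: '-window 100000'], 'annot_wnode')
+        // would yield "-window 100000"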
+
+        def (jobs, lines_per_file) = gpx_qsubmit(scaffolds, chains, chains_slices, gpx_qsubmit_params)
+        def annot_files = annot(jobs.flatten(), chains, hmm_params, softmask_lds2, softmask_lds2_source, genome, proteins, lines_per_file, annot_params)
+        gpx_qdump(annot_files.collect(), gpx_qdump_params)
+    emit:
+        outputs = gpx_qdump.out.outputs
+}
+
+
+process gpx_qsubmit {
+    input:
+        path scaffolds
+        path chains
+        path chains_slices
+        val params
+    output:
+        path "job.*"
+        env lines_per_file
+    script:
+        njobs=16
+    """
+    echo $scaffolds | tr ' ' '\\n' > scaffolds.mft
+    for file in $chains_slices; do
+        echo \$file >> chains_slices.mft
+        # remove path from the first line of this file
+        sed -i -e '1s/\\(.*\\)\\/\\(.*\\)\$/\\2/' \$file
+    done
+    gpx_qsubmit $params -ids-manifest scaffolds.mft -slices-manifest chains_slices.mft -o jobs
+    total_lines=\$(wc -l <jobs)
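+    # ceiling division so that lines_per_file * njobs >= total_lines
+    # (e.g. 100 total lines across 16 chunks -> 7 lines per file)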
+    (( lines_per_file = (total_lines + ${njobs} - 1) / ${njobs} ))
+    echo total_lines=\$total_lines, lines_per_file=\$lines_per_file
+    # split -l\$lines_per_file jobs job. -da 3
+    # Use round robin to distribute jobs across nodes more evenly
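+    # 'split -n r/N' deals input lines out round-robin into N output files
+    # (job.000, job.001, ...); N is capped at total_lines so that no empty
+    # job.* files are created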
+    if [ \$total_lines -lt $njobs ]; then
+        effective_njobs=\$total_lines
+    else
+        effective_njobs=$njobs
+    fi
+    split -nr/\$effective_njobs jobs job. -da 3
+    """
+    stub:
+        njobs=16
+    """
+    for i in {1..$njobs}; do
+        echo j.\${i} >> jobs
+    done
+    split -nr/$njobs jobs job. -da 3
+    lines_per_file=10
+    """
+}
+
+
+process annot {
+    input:
+        path jobs
+        path chains // used for staging the chain files referenced by the jobs
+        path hmm_params
+        path softmask_lds2
+        path softmask
+        path genome, stageAs: 'indexed/*'
+        path proteins_asn, stageAs: 'indexed/*'
+        val lines_per_file
+        val params
+    output:
+        path "output/*"
+    script:
+        job_num = jobs.toString().tokenize('.').last().toInteger()
+    """
+    njobs=`wc -l <$jobs`
+    if [ \$njobs -lt 16 ]; then
+        threads=\$njobs
+    else
+        threads=16
+    fi
+
+    lds2=indexed_lds
+    if [ -n "$softmask_lds2" ]; then
+        # patch LDS2 to point to the source
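+        # the LDS2 SQLite index stores the original file paths; rewriting them
+        # to bare basenames lets them resolve against the files staged into
+        # this work directory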
+        files=\$(sqlite3 $softmask_lds2 -cmd "SELECT file_name FROM file" ".exit")
+        for f in \$files; do
+            base=\$(basename \$f)
+            sqlite3 $softmask_lds2 -cmd "UPDATE file SET file_name = '\$base' WHERE file_name = '\$f'" ".exit"
+        done
+        lds2+=",$softmask_lds2"
+    elif [ -n "$softmask" ]; then
+        mkdir sm_src
+        mv $softmask ./sm_src/
+        lds2_indexer -source ./sm_src/ -db softmask_lds2
+        lds2+=",softmask_lds2"
+    fi
+
+    filename=\$(basename -- "$jobs")
+    extension="\${filename##*.}"
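+    # derive a non-overlapping job-id range for this chunk from the numeric
+    # suffix of the job file; 10# forces base-10 so a suffix such as 008 is
+    # not parsed as octal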
+    (( start_job_id = ((10#\$extension) * $lines_per_file) + 1 ))
+
+    # make the local LDS of the genomic fasta
+    lds2_indexer -source indexed -db indexed_lds
+
+    # When running multiple jobs on the cluster, several jobs may generate
+    # output files with the same filename. To stage all of the outputs
+    # together for gpx_qdump we prefix each filename with the numeric
+    # extension of its job file.
+    mkdir interim
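+    # note: '|| true' tolerates a non-zero exit from annot_wnode so that any
+    # per-job output already produced is still renamed and collected below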
+    annot_wnode $params -nogenbank -lds2 \$lds2 -start-job-id \$start_job_id -workers \$threads -input-jobs $jobs -param $hmm_params -O interim || true
+    mkdir output
+    for f in interim/*; do
+        if [ -f \$f ]; then
+            mv \$f output/\${extension}_\$(basename \$f)
+        fi
+    done
+    """
+    stub:
+        job_num = jobs.toString().tokenize('.').last().toInteger()
+    """
+    mkdir -p output
+    touch output/sample_gnomon_wnode.${job_num}.out
+    """
+}
+
+
+process gpx_qdump {
+    input:
+        path files, stageAs: "inputs/*"
+        val params
+    output:
+        path "*.out", emit: "outputs"
+    script:
+    """
+    gpx_qdump $params -input-path inputs -output gnomon_wnode.out
+    """
+    stub:
+    """
+    touch gnomon_wnode.out
+    """
+}