comparison nf/subworkflows/ncbi/target_proteins/miniprot/main.nf @ 0:d9c5c5b87fec draft

planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
author fubar
date Sat, 03 Aug 2024 11:16:53 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d9c5c5b87fec
1 #!/usr/bin/env nextflow
2
3 nextflow.enable.dsl=2
4
5 include { merge_params } from '../../utilities'
6
7
/**
 * Build the option string passed to the miniprot executable.
 *
 * The user-supplied "miniprot" entry of `parameters` is normalised by
 * rewriting the first occurrence of "-cpu-count" to "-t" and of
 * "-max-intron" to "-G", and is then combined with the defaults
 * ("-t 8 -G <max_intron>") through the imported `merge_params` helper.
 *
 * @param parameters Map of per-tool option strings; only the "miniprot"
 *                   key is read here. May be null or lack that key.
 * @param max_intron Value substituted after "-G" in the default options.
 * @return whatever `merge_params` returns for the "miniprot" key
 *         (presumably the merged option string -- confirm against
 *         '../../utilities').
 */
def get_effective_params(parameters, max_intron) {
    def default_params = "-t 8 -G ${max_intron}"
    // Work on a shallow copy: the incoming map is the process input value
    // shared by every task, so it must not be modified here. Note that
    // Groovy's Map.get(key, default) would also insert the default into
    // the map, hence the plain get() with an Elvis fallback.
    def local_params = parameters == null ? [:] : new LinkedHashMap(parameters)
    def value = local_params.get("miniprot") ?: ""
    // Translate the long option aliases into the short flags used in the
    // default option string above.
    value = value.replaceFirst("-cpu-count", "-t")
    value = value.replaceFirst("-max-intron", "-G")
    local_params['miniprot'] = value
    return merge_params(default_params, local_params, "miniprot")
}
17
/*
 * Align proteins against a genome with miniprot.
 *
 * The protein input is optionally split into chunks by `split_proteins`;
 * every resulting file is then handed to `run_miniprot`.
 */
workflow miniprot {
    take:
        fasta_genome_file     // path: genome fasta file
        fasta_proteins_file   // path: protein fasta file
        max_intron            // int: max intron length
        parameters            // Map: extra parameter and parameter update
    main:
        // Resolve the chunk size from the "split_proteins" entry of the
        // parameters; "-n 1000000000" is the default handed to merge_params.
        def split_opts = merge_params("-n 1000000000", parameters, "split_proteins")
        def items_per_chunk = split_opts.replaceFirst("-n ", "").toInteger()

        // Start from the unsplit input and only invoke the splitter when
        // the chunk size differs from the 1000000000 default.
        def protein_chunks = fasta_proteins_file
        if (items_per_chunk != 1000000000) {
            protein_chunks = split_proteins(fasta_proteins_file, items_per_chunk)
        }
        run_miniprot(fasta_genome_file, protein_chunks.flatten(), max_intron, parameters)

    emit:
        miniprot_file = run_miniprot.out.miniprot_file
}
38
39
/*
 * Split a protein FASTA file into numbered chunk files.
 *
 * Input:
 *   fasta_proteins_file - FASTA file to split
 *   items_per_chunk     - maximum number of records (lines starting with
 *                         '>') written to each chunk
 * Output:
 *   output/1.fa, output/2.fa, ... in input order; the last chunk may hold
 *   fewer records than the others.
 */
process split_proteins {
    input:
        path fasta_proteins_file
        val items_per_chunk
    output:
        path 'output/*'
    script:
    """
    #!/usr/bin/env python3
    import os

    os.makedirs("output", exist_ok=True)
    with open("${fasta_proteins_file}", 'rt') as f:
        items = 0
        chunk = []
        nextfile = 1
        for line in f:
            if line and line[0] == '>':
                items += 1
                # Flush only once the current chunk already holds a full
                # set of records and this header starts one more. Using
                # '>=' here would emit chunks one record short (and an
                # empty first file when the chunk size is 1).
                if items > ${items_per_chunk}:
                    with open(f"output/{nextfile}.fa", "w") as outf:
                        outf.write(''.join(chunk))
                    chunk = []
                    nextfile += 1
                    # The header just read is the first record of the
                    # next chunk.
                    items = 1
            chunk.append(line)
        # Write whatever is left as the final chunk.
        if chunk:
            with open(f"output/{nextfile}.fa", "w") as outf:
                outf.write(''.join(chunk))
    """
    stub:
        print("items_per_chunk ${items_per_chunk}")
    """
    mkdir -p output
    touch output/1.fa
    touch output/2.fa
    touch output/3.fa
    """
}
79
80
/*
 * Run miniprot for one protein file against the genome.
 *
 * Input:
 *   fasta_genome_file   - genome FASTA file
 *   fasta_proteins_file - protein FASTA file (whole input or one chunk)
 *   max_intron          - value forwarded to get_effective_params
 *   parameters          - Map of option overrides, forwarded likewise
 * Output:
 *   miniprot_file - output/<protein file base name>.paf
 */
process run_miniprot {
    label 'huge_job'
    label 'long_job'
    input:
        path fasta_genome_file
        path fasta_proteins_file
        val max_intron
        val parameters
    output:
        path ('output/*.paf'), emit: 'miniprot_file'

    script:
        // Command-line options: defaults merged with the user overrides.
        def miniprot_opts = get_effective_params(parameters, max_intron)
        // Name the result after the protein file so chunk outputs stay distinct.
        def result_name = "${fasta_proteins_file.baseName}.paf"
    """
    mkdir -p output
    miniprot ${miniprot_opts} ${fasta_genome_file} ${fasta_proteins_file} > output/${result_name}
    """
    stub:
        def miniprot_opts = get_effective_params(parameters, max_intron)
        def result_name = "${fasta_proteins_file.baseName}.paf"
        println("Miniprot params: ${miniprot_opts}")
    """
    mkdir -p output
    touch output/${result_name}
    """
}