comparison nf/subworkflows/ncbi/setup/main.nf @ 0:d9c5c5b87fec draft

planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
author fubar
date Sat, 03 Aug 2024 11:16:53 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d9c5c5b87fec
1 #!/usr/bin/env nextflow
2 nextflow.enable.dsl=2
3
4 include { merge_params } from '../utilities'
5
6 workflow setup_genome {
7 take:
8 genome
9 organelles
10 parameters // Map : extra parameter and parameter update
11 main:
12 get_genome_info(genome, organelles, parameters)
13 emit:
14 seqid_list = get_genome_info.out.seqid_list
15 gencoll_asn = get_genome_info.out.gencoll_asn
16 unpacked_genome = get_genome_info.out.fasta
17 genome_asn = get_genome_info.out.genome_asn
18 genome_asnb = get_genome_info.out.genome_asnb
19 max_intron = get_genome_info.out.max_intron
20 }
21
22
23 process get_genome_info {
24 debug true
25 input:
26 path fasta_genome_file, stageAs: 'src/*'
27 path organelles
28 val parameters
29 output:
30 path '*.seqids', emit: 'seqid_list'
31 path '*.asn', emit: 'gencoll_asn'
32 path "${out_fasta}", emit: 'fasta'
33 path "${genome_asn}", emit: 'genome_asn'
34 path "${genome_asnb}", emit: 'genome_asnb'
35 env max_intron, emit: 'max_intron'
36 script:
37 need_zcat = fasta_genome_file.toString().endsWith('.gz')
38 base_name_stripped = fasta_genome_file.baseName.toString().replaceAll(/\.(fa(sta)?|fna)(\.gz)?$/, "")
39 indexed_fasta_name = fasta_genome_file.baseName.toString().replaceFirst(/\.(fa(sta)?|fna)(\.gz)?$/, ".fasta")
40 if (! indexed_fasta_name.endsWith(".fasta")) {
41 indexed_fasta_name += ".fasta"
42 }
43 genome_dir = "genome"
44 fasta_dir = "fasta"
45 out_fasta = fasta_dir + "/" + indexed_fasta_name
46 genome_asn = genome_dir + "/" + base_name_stripped + ".asn"
47 genome_asnb = genome_dir + "/" + base_name_stripped + ".asnb"
48 max_intron = parameters.max_intron
49 genome_size_threshold = parameters.genome_size_threshold
50 """
51 # echo "need_zcat: ${need_zcat}, out_fasta: ${out_fasta}"
52 mkdir -p ${genome_dir}
53 mkdir -p ${fasta_dir}
54 if [[ ${need_zcat} == true ]]; then
55 # zcat ${fasta_genome_file} | sed 's/^\\(>gi|[0-9]\\+\\)|\\?\\([^ ]\\+\\)\\(.*\\)/\\1\\3/' > ${out_fasta}
56 # zcat ${fasta_genome_file} > ${out_fasta}
57 zcat ${fasta_genome_file} | sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' > ${out_fasta}
58 else
59 # sed 's/^\\(>gi|[0-9]\\+\\)|\\?\\([^ ]\\+\\)\\(.*\\)/\\1\\3/' ${fasta_genome_file} > ${out_fasta}
60 # mv ${fasta_genome_file} ${out_fasta}
61 sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' ${fasta_genome_file} > ${out_fasta}
62 fi
63 # Old way, now use gc_get_molecules. For multipart ids with gi first use the second part
64 # grep -oP "^>\\K[^ ]+" ${out_fasta} | sed 's/^\\(gi|[0-9]\\+\\)|\\([^|]\\+|[^|]\\+\\)|\\?/\\2/' >list.seqids
65 multireader -flags ParseRawID -out-format asn_text -input ${out_fasta} -output ${genome_asn}
66 multireader -flags ParseRawID -out-format asn_binary -input ${out_fasta} -output ${genome_asnb}
67 lds2_indexer -source ${genome_dir}/ -db LDS2
68 # Using all parts of multipart ids is preferrable, but slower - one more pass over genomic FASTA
69 gc_create -unplaced ${out_fasta} -unplaced-fmt fasta -fasta-parse-raw-id -gc-assm-name "EGAPx Test Assembly" -nogenbank -lds2 LDS2 >gencoll.asn
70 gc_get_molecules -gc-assembly gencoll.asn -filter all -level top-level > list.seqids
71
72 #TODO: subtract organelles from list
73
74 # This is a rough estimate because we don't need the more accurate size
75 genome_size=`wc -c <${out_fasta}`
76 # Max intron logic
77 if [ $genome_size_threshold -gt 0 ] && [ \$genome_size -lt $genome_size_threshold ]; then
78 # scale max intron to genome size, rounding up to nearest 100kb
79 (( max_intron = ($max_intron * genome_size / $genome_size_threshold + 99999) / 100000 * 100000 ))
80 # echo "Setting max_intron to \$max_intron"
81 else
82 max_intron=$max_intron
83 fi
84 """
85
86 stub:
87 base_name_stripped = fasta_genome_file.baseName.toString().replaceAll(/\.(fa(sta)?|fna)(\.gz)?$/, "")
88 indexed_fasta_name = fasta_genome_file.baseName.toString().replaceFirst(/\.(fa(sta)?|fna)(\.gz)?$/, ".fasta")
89 if (! indexed_fasta_name.endsWith(".fasta")) {
90 indexed_fasta_name += ".fasta"
91 }
92 genome_dir = "genome"
93 fasta_dir = "fasta"
94 out_fasta = fasta_dir + "/" + indexed_fasta_name
95 genome_asn = genome_dir + "/" + base_name_stripped + ".asn"
96 genome_asnb = genome_dir + "/" + base_name_stripped + ".asnb"
97 """
98 mkdir -p $genome_dir
99 mkdir -p $fasta_dir
100 touch $out_fasta
101 touch $genome_asn
102 touch $genome_asnb
103 touch gencoll.asn
104 touch list.seqids
105 max_intron=10000
106 echo "Processing genome $fasta_genome_file"
107 echo "Setting max_intron to \$max_intron"
108 """
109 }
110
111
112 workflow setup_proteins {
113 take:
114 proteins
115 parameters // Map : extra parameter and parameter update
116 main:
117 convert_proteins(proteins)
118 emit:
119 unpacked_proteins = convert_proteins.out.unpacked_proteins
120 proteins_asn = convert_proteins.out.proteins_asn
121 proteins_asnb = convert_proteins.out.proteins_asnb
122 }
123
124
125 process convert_proteins {
126 input:
127 path fasta_proteins_file, stageAs: 'src/*'
128 output:
129 path out_fasta, emit: 'unpacked_proteins'
130 path proteins_asn, emit: 'proteins_asn'
131 path proteins_asnb, emit: 'proteins_asnb'
132 script:
133 need_zcat = fasta_proteins_file.toString().endsWith('.gz')
134 base_name_stripped = fasta_proteins_file.baseName.toString().replaceAll(/\.(fa(sta)?|faa)(\.gz)?$/, "")
135 fasta_name = base_name_stripped + ".faa"
136
137 asn_dir = "asn"
138 fasta_dir = "fasta"
139 out_fasta = fasta_dir + "/" + fasta_name
140 proteins_asn = asn_dir + "/" + base_name_stripped + ".asn"
141 proteins_asnb = asn_dir + "/" + base_name_stripped + ".asnb"
142 """
143 mkdir -p ${asn_dir}
144 mkdir -p ${fasta_dir}
145 if [[ ${need_zcat} == true ]]; then
146 zcat ${fasta_proteins_file} | sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' > ${out_fasta}
147 else
148 sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' ${fasta_proteins_file} > ${out_fasta}
149 fi
150 multireader -flags ParseRawID -out-format asn_text -input ${out_fasta} -output ${proteins_asn}
151 multireader -flags ParseRawID -out-format asn_binary -input ${out_fasta} -output ${proteins_asnb}
152 """
153
154 stub:
155 base_name_stripped = fasta_proteins_file.baseName.toString().replaceAll(/\.(fa(sta)?|faa)(\.gz)?$/, "")
156 fasta_name = base_name_stripped + ".faa"
157 asn_dir = "asn"
158 fasta_dir = "fasta"
159 out_fasta = fasta_dir + "/" + fasta_name
160 proteins_asn = asn_dir + "/" + base_name_stripped + ".asn"
161 proteins_asnb = asn_dir + "/" + base_name_stripped + ".asnb"
162
163 """
164 mkdir -p $asn_dir
165 mkdir -p $fasta_dir
166 touch $out_fasta
167 touch $proteins_asn
168 touch $proteins_asnb
169 """
170
171 }