Mercurial > repos > fubar > egapx_runner
comparison nf/subworkflows/ncbi/setup/main.nf @ 0:d9c5c5b87fec draft
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
author | fubar |
---|---|
date | Sat, 03 Aug 2024 11:16:53 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d9c5c5b87fec |
---|---|
1 #!/usr/bin/env nextflow | |
2 nextflow.enable.dsl=2 | |
3 | |
4 include { merge_params } from '../utilities' | |
5 | |
6 workflow setup_genome { | |
7 take: | |
8 genome | |
9 organelles | |
10 parameters // Map : extra parameter and parameter update | |
11 main: | |
12 get_genome_info(genome, organelles, parameters) | |
13 emit: | |
14 seqid_list = get_genome_info.out.seqid_list | |
15 gencoll_asn = get_genome_info.out.gencoll_asn | |
16 unpacked_genome = get_genome_info.out.fasta | |
17 genome_asn = get_genome_info.out.genome_asn | |
18 genome_asnb = get_genome_info.out.genome_asnb | |
19 max_intron = get_genome_info.out.max_intron | |
20 } | |
21 | |
22 | |
23 process get_genome_info { | |
24 debug true | |
25 input: | |
26 path fasta_genome_file, stageAs: 'src/*' | |
27 path organelles | |
28 val parameters | |
29 output: | |
30 path '*.seqids', emit: 'seqid_list' | |
31 path '*.asn', emit: 'gencoll_asn' | |
32 path "${out_fasta}", emit: 'fasta' | |
33 path "${genome_asn}", emit: 'genome_asn' | |
34 path "${genome_asnb}", emit: 'genome_asnb' | |
35 env max_intron, emit: 'max_intron' | |
36 script: | |
37 need_zcat = fasta_genome_file.toString().endsWith('.gz') | |
38 base_name_stripped = fasta_genome_file.baseName.toString().replaceAll(/\.(fa(sta)?|fna)(\.gz)?$/, "") | |
39 indexed_fasta_name = fasta_genome_file.baseName.toString().replaceFirst(/\.(fa(sta)?|fna)(\.gz)?$/, ".fasta") | |
40 if (! indexed_fasta_name.endsWith(".fasta")) { | |
41 indexed_fasta_name += ".fasta" | |
42 } | |
43 genome_dir = "genome" | |
44 fasta_dir = "fasta" | |
45 out_fasta = fasta_dir + "/" + indexed_fasta_name | |
46 genome_asn = genome_dir + "/" + base_name_stripped + ".asn" | |
47 genome_asnb = genome_dir + "/" + base_name_stripped + ".asnb" | |
48 max_intron = parameters.max_intron | |
49 genome_size_threshold = parameters.genome_size_threshold | |
50 """ | |
51 # echo "need_zcat: ${need_zcat}, out_fasta: ${out_fasta}" | |
52 mkdir -p ${genome_dir} | |
53 mkdir -p ${fasta_dir} | |
54 if [[ ${need_zcat} == true ]]; then | |
55 # zcat ${fasta_genome_file} | sed 's/^\\(>gi|[0-9]\\+\\)|\\?\\([^ ]\\+\\)\\(.*\\)/\\1\\3/' > ${out_fasta} | |
56 # zcat ${fasta_genome_file} > ${out_fasta} | |
57 zcat ${fasta_genome_file} | sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' > ${out_fasta} | |
58 else | |
59 # sed 's/^\\(>gi|[0-9]\\+\\)|\\?\\([^ ]\\+\\)\\(.*\\)/\\1\\3/' ${fasta_genome_file} > ${out_fasta} | |
60 # mv ${fasta_genome_file} ${out_fasta} | |
61 sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' ${fasta_genome_file} > ${out_fasta} | |
62 fi | |
63 # Old way, now use gc_get_molecules. For multipart ids with gi first use the second part | |
64 # grep -oP "^>\\K[^ ]+" ${out_fasta} | sed 's/^\\(gi|[0-9]\\+\\)|\\([^|]\\+|[^|]\\+\\)|\\?/\\2/' >list.seqids | |
65 multireader -flags ParseRawID -out-format asn_text -input ${out_fasta} -output ${genome_asn} | |
66 multireader -flags ParseRawID -out-format asn_binary -input ${out_fasta} -output ${genome_asnb} | |
67 lds2_indexer -source ${genome_dir}/ -db LDS2 | |
68 # Using all parts of multipart ids is preferrable, but slower - one more pass over genomic FASTA | |
69 gc_create -unplaced ${out_fasta} -unplaced-fmt fasta -fasta-parse-raw-id -gc-assm-name "EGAPx Test Assembly" -nogenbank -lds2 LDS2 >gencoll.asn | |
70 gc_get_molecules -gc-assembly gencoll.asn -filter all -level top-level > list.seqids | |
71 | |
72 #TODO: subtract organelles from list | |
73 | |
74 # This is a rough estimate because we don't need the more accurate size | |
75 genome_size=`wc -c <${out_fasta}` | |
76 # Max intron logic | |
77 if [ $genome_size_threshold -gt 0 ] && [ \$genome_size -lt $genome_size_threshold ]; then | |
78 # scale max intron to genome size, rounding up to nearest 100kb | |
79 (( max_intron = ($max_intron * genome_size / $genome_size_threshold + 99999) / 100000 * 100000 )) | |
80 # echo "Setting max_intron to \$max_intron" | |
81 else | |
82 max_intron=$max_intron | |
83 fi | |
84 """ | |
85 | |
86 stub: | |
87 base_name_stripped = fasta_genome_file.baseName.toString().replaceAll(/\.(fa(sta)?|fna)(\.gz)?$/, "") | |
88 indexed_fasta_name = fasta_genome_file.baseName.toString().replaceFirst(/\.(fa(sta)?|fna)(\.gz)?$/, ".fasta") | |
89 if (! indexed_fasta_name.endsWith(".fasta")) { | |
90 indexed_fasta_name += ".fasta" | |
91 } | |
92 genome_dir = "genome" | |
93 fasta_dir = "fasta" | |
94 out_fasta = fasta_dir + "/" + indexed_fasta_name | |
95 genome_asn = genome_dir + "/" + base_name_stripped + ".asn" | |
96 genome_asnb = genome_dir + "/" + base_name_stripped + ".asnb" | |
97 """ | |
98 mkdir -p $genome_dir | |
99 mkdir -p $fasta_dir | |
100 touch $out_fasta | |
101 touch $genome_asn | |
102 touch $genome_asnb | |
103 touch gencoll.asn | |
104 touch list.seqids | |
105 max_intron=10000 | |
106 echo "Processing genome $fasta_genome_file" | |
107 echo "Setting max_intron to \$max_intron" | |
108 """ | |
109 } | |
110 | |
111 | |
112 workflow setup_proteins { | |
113 take: | |
114 proteins | |
115 parameters // Map : extra parameter and parameter update | |
116 main: | |
117 convert_proteins(proteins) | |
118 emit: | |
119 unpacked_proteins = convert_proteins.out.unpacked_proteins | |
120 proteins_asn = convert_proteins.out.proteins_asn | |
121 proteins_asnb = convert_proteins.out.proteins_asnb | |
122 } | |
123 | |
124 | |
125 process convert_proteins { | |
126 input: | |
127 path fasta_proteins_file, stageAs: 'src/*' | |
128 output: | |
129 path out_fasta, emit: 'unpacked_proteins' | |
130 path proteins_asn, emit: 'proteins_asn' | |
131 path proteins_asnb, emit: 'proteins_asnb' | |
132 script: | |
133 need_zcat = fasta_proteins_file.toString().endsWith('.gz') | |
134 base_name_stripped = fasta_proteins_file.baseName.toString().replaceAll(/\.(fa(sta)?|faa)(\.gz)?$/, "") | |
135 fasta_name = base_name_stripped + ".faa" | |
136 | |
137 asn_dir = "asn" | |
138 fasta_dir = "fasta" | |
139 out_fasta = fasta_dir + "/" + fasta_name | |
140 proteins_asn = asn_dir + "/" + base_name_stripped + ".asn" | |
141 proteins_asnb = asn_dir + "/" + base_name_stripped + ".asnb" | |
142 """ | |
143 mkdir -p ${asn_dir} | |
144 mkdir -p ${fasta_dir} | |
145 if [[ ${need_zcat} == true ]]; then | |
146 zcat ${fasta_proteins_file} | sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' > ${out_fasta} | |
147 else | |
148 sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' ${fasta_proteins_file} > ${out_fasta} | |
149 fi | |
150 multireader -flags ParseRawID -out-format asn_text -input ${out_fasta} -output ${proteins_asn} | |
151 multireader -flags ParseRawID -out-format asn_binary -input ${out_fasta} -output ${proteins_asnb} | |
152 """ | |
153 | |
154 stub: | |
155 base_name_stripped = fasta_proteins_file.baseName.toString().replaceAll(/\.(fa(sta)?|faa)(\.gz)?$/, "") | |
156 fasta_name = base_name_stripped + ".faa" | |
157 asn_dir = "asn" | |
158 fasta_dir = "fasta" | |
159 out_fasta = fasta_dir + "/" + fasta_name | |
160 proteins_asn = asn_dir + "/" + base_name_stripped + ".asn" | |
161 proteins_asnb = asn_dir + "/" + base_name_stripped + ".asnb" | |
162 | |
163 """ | |
164 mkdir -p $asn_dir | |
165 mkdir -p $fasta_dir | |
166 touch $out_fasta | |
167 touch $proteins_asn | |
168 touch $proteins_asnb | |
169 """ | |
170 | |
171 } |