Mercurial > repos > fubar > egapx_runner
annotate nf/subworkflows/ncbi/setup/main.nf @ 5:6effccc966d0 draft
planemo upload for repository https://github.com/ncbi/egapx commit 9e59da535540cb4d5c1c412bb2b0969744dfb0b0
author | fubar |
---|---|
date | Sun, 04 Aug 2024 01:59:37 +0000 |
parents | d9c5c5b87fec |
children |
rev | line source |
---|---|
0
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
1 #!/usr/bin/env nextflow |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
2 nextflow.enable.dsl=2 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
3 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
4 include { merge_params } from '../utilities' |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
5 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
/*
 * Sub-workflow: prepare the genome for downstream annotation steps.
 *
 * take:
 *   genome      - genome FASTA (plain or gzip-compressed), passed to get_genome_info
 *   organelles  - organelle list file, passed through to get_genome_info
 *   parameters  - Map : extra parameter and parameter update
 *
 * emit:
 *   every named output of get_genome_info, re-exported one-to-one
 *   (the 'fasta' output is exposed as 'unpacked_genome').
 */
workflow setup_genome {
    take:
        genome
        organelles
        parameters  // Map : extra parameter and parameter update

    main:
        // Single process does all the work; this workflow only wires inputs to outputs.
        get_genome_info(genome, organelles, parameters)

    emit:
        seqid_list      = get_genome_info.out.seqid_list
        gencoll_asn     = get_genome_info.out.gencoll_asn
        unpacked_genome = get_genome_info.out.fasta
        genome_asn      = get_genome_info.out.genome_asn
        genome_asnb     = get_genome_info.out.genome_asnb
        max_intron      = get_genome_info.out.max_intron
}
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
21 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
22 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
/*
 * Normalize the genome FASTA and derive the files the rest of the pipeline needs.
 *
 * input:
 *   fasta_genome_file - genome FASTA, optionally gzip-compressed (detected by '.gz' suffix);
 *                       staged under src/
 *   organelles        - organelle list file; staged but not used yet (see TODO in the script)
 *   parameters        - Map; 'max_intron' and 'genome_size_threshold' are read from it
 *
 * output:
 *   seqid_list  - list.seqids, written by gc_get_molecules
 *   gencoll_asn - gencoll.asn, written by gc_create
 *   fasta       - fasta/<name>.fasta, the uncompressed FASTA with rewritten deflines
 *   genome_asn  - genome/<name>.asn, multireader output in asn_text format
 *   genome_asnb - genome/<name>.asnb, multireader output in asn_binary format
 *   max_intron  - value of the shell variable max_intron at the end of the script
 */
process get_genome_info {
    debug true
    input:
        path fasta_genome_file, stageAs: 'src/*'
        path organelles
        val parameters
    output:
        path '*.seqids', emit: 'seqid_list'
        path '*.asn', emit: 'gencoll_asn'
        path "${out_fasta}", emit: 'fasta'
        path "${genome_asn}", emit: 'genome_asn'
        path "${genome_asnb}", emit: 'genome_asnb'
        env max_intron, emit: 'max_intron'
    script:
        need_zcat = fasta_genome_file.toString().endsWith('.gz')
        // baseName already drops the last extension, so for 'x.fna.gz' the regex sees 'x.fna'.
        base_name_stripped = fasta_genome_file.baseName.toString().replaceAll(/\.(fa(sta)?|fna)(\.gz)?$/, "")
        indexed_fasta_name = fasta_genome_file.baseName.toString().replaceFirst(/\.(fa(sta)?|fna)(\.gz)?$/, ".fasta")
        if (! indexed_fasta_name.endsWith(".fasta")) {
            indexed_fasta_name += ".fasta"
        }
        genome_dir = "genome"
        fasta_dir = "fasta"
        out_fasta = fasta_dir + "/" + indexed_fasta_name
        genome_asn = genome_dir + "/" + base_name_stripped + ".asn"
        genome_asnb = genome_dir + "/" + base_name_stripped + ".asnb"
        max_intron = parameters.max_intron
        genome_size_threshold = parameters.genome_size_threshold
    """
    mkdir -p ${genome_dir}
    mkdir -p ${fasta_dir}
    # Prefix single-part sequence ids with 'lcl|'. The expression is anchored to the
    # start of the line so that only the defline marker itself is rewritten, never a
    # '>' that happens to occur later in the description.
    if [[ ${need_zcat} == true ]]; then
        zcat ${fasta_genome_file} | sed 's/^>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' > ${out_fasta}
    else
        sed 's/^>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' ${fasta_genome_file} > ${out_fasta}
    fi
    multireader -flags ParseRawID -out-format asn_text -input ${out_fasta} -output ${genome_asn}
    multireader -flags ParseRawID -out-format asn_binary -input ${out_fasta} -output ${genome_asnb}
    lds2_indexer -source ${genome_dir}/ -db LDS2
    # Using all parts of multipart ids is preferable, but slower - one more pass over genomic FASTA
    gc_create -unplaced ${out_fasta} -unplaced-fmt fasta -fasta-parse-raw-id -gc-assm-name "EGAPx Test Assembly" -nogenbank -lds2 LDS2 >gencoll.asn
    gc_get_molecules -gc-assembly gencoll.asn -filter all -level top-level > list.seqids

    #TODO: subtract organelles from list

    # This is a rough estimate because we don't need the more accurate size
    genome_size=`wc -c <${out_fasta}`
    # Max intron logic
    if [ $genome_size_threshold -gt 0 ] && [ \$genome_size -lt $genome_size_threshold ]; then
        # Scale max intron to genome size, rounding up to nearest 100kb.
        # Assigned through arithmetic expansion on purpose: a bare arithmetic command
        # returns a failing status when its value is 0 (possible for very small
        # genomes), which would abort the task when the shell runs with errexit.
        max_intron=\$(( ($max_intron * genome_size / $genome_size_threshold + 99999) / 100000 * 100000 ))
    else
        max_intron=$max_intron
    fi
    """

    stub:
        base_name_stripped = fasta_genome_file.baseName.toString().replaceAll(/\.(fa(sta)?|fna)(\.gz)?$/, "")
        indexed_fasta_name = fasta_genome_file.baseName.toString().replaceFirst(/\.(fa(sta)?|fna)(\.gz)?$/, ".fasta")
        if (! indexed_fasta_name.endsWith(".fasta")) {
            indexed_fasta_name += ".fasta"
        }
        genome_dir = "genome"
        fasta_dir = "fasta"
        out_fasta = fasta_dir + "/" + indexed_fasta_name
        genome_asn = genome_dir + "/" + base_name_stripped + ".asn"
        genome_asnb = genome_dir + "/" + base_name_stripped + ".asnb"
    """
    mkdir -p $genome_dir
    mkdir -p $fasta_dir
    touch $out_fasta
    touch $genome_asn
    touch $genome_asnb
    touch gencoll.asn
    touch list.seqids
    max_intron=10000
    echo "Processing genome $fasta_genome_file"
    echo "Setting max_intron to \$max_intron"
    """
}
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
110 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
111 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
/*
 * Sub-workflow: prepare the protein set for downstream steps.
 *
 * take:
 *   proteins    - protein FASTA (plain or gzip-compressed), passed to convert_proteins
 *   parameters  - Map : extra parameter and parameter update
 *                 (accepted for interface symmetry; not used in this workflow body)
 *
 * emit:
 *   the three named outputs of convert_proteins, re-exported under the same names.
 */
workflow setup_proteins {
    take:
        proteins
        parameters  // Map : extra parameter and parameter update

    main:
        convert_proteins(proteins)

    emit:
        unpacked_proteins = convert_proteins.out.unpacked_proteins
        proteins_asn      = convert_proteins.out.proteins_asn
        proteins_asnb     = convert_proteins.out.proteins_asnb
}
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
123 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
124 |
d9c5c5b87fec
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
fubar
parents:
diff
changeset
|
/*
 * Normalize the protein FASTA and convert it to ASN.1.
 *
 * input:
 *   fasta_proteins_file - protein FASTA, optionally gzip-compressed (detected by '.gz'
 *                         suffix); staged under src/
 *
 * output:
 *   unpacked_proteins - fasta/<name>.faa, the uncompressed FASTA with rewritten deflines
 *   proteins_asn      - asn/<name>.asn, multireader output in asn_text format
 *   proteins_asnb     - asn/<name>.asnb, multireader output in asn_binary format
 */
process convert_proteins {
    input:
        path fasta_proteins_file, stageAs: 'src/*'
    output:
        path out_fasta, emit: 'unpacked_proteins'
        path proteins_asn, emit: 'proteins_asn'
        path proteins_asnb, emit: 'proteins_asnb'
    script:
        need_zcat = fasta_proteins_file.toString().endsWith('.gz')
        // baseName already drops the last extension, so for 'x.faa.gz' the regex sees 'x.faa'.
        base_name_stripped = fasta_proteins_file.baseName.toString().replaceAll(/\.(fa(sta)?|faa)(\.gz)?$/, "")
        fasta_name = base_name_stripped + ".faa"

        asn_dir = "asn"
        fasta_dir = "fasta"
        out_fasta = fasta_dir + "/" + fasta_name
        proteins_asn = asn_dir + "/" + base_name_stripped + ".asn"
        proteins_asnb = asn_dir + "/" + base_name_stripped + ".asnb"
    """
    mkdir -p ${asn_dir}
    mkdir -p ${fasta_dir}
    # Prefix single-part sequence ids with 'lcl|'. The expression is anchored to the
    # start of the line so that only the defline marker itself is rewritten, never a
    # '>' that happens to occur later in the description.
    if [[ ${need_zcat} == true ]]; then
        zcat ${fasta_proteins_file} | sed 's/^>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' > ${out_fasta}
    else
        sed 's/^>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' ${fasta_proteins_file} > ${out_fasta}
    fi
    multireader -flags ParseRawID -out-format asn_text -input ${out_fasta} -output ${proteins_asn}
    multireader -flags ParseRawID -out-format asn_binary -input ${out_fasta} -output ${proteins_asnb}
    """

    stub:
        base_name_stripped = fasta_proteins_file.baseName.toString().replaceAll(/\.(fa(sta)?|faa)(\.gz)?$/, "")
        fasta_name = base_name_stripped + ".faa"
        asn_dir = "asn"
        fasta_dir = "fasta"
        out_fasta = fasta_dir + "/" + fasta_name
        proteins_asn = asn_dir + "/" + base_name_stripped + ".asn"
        proteins_asnb = asn_dir + "/" + base_name_stripped + ".asnb"

    """
    mkdir -p $asn_dir
    mkdir -p $fasta_dir
    touch $out_fasta
    touch $proteins_asn
    touch $proteins_asnb
    """

}