diff combine_metaphlan_humann.py @ 3:01ac9954c27f draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit b84cbcbffa25c55acd1a31df76e4a4f78772cbd7
author bgruening
date Thu, 20 Jul 2023 10:07:12 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_metaphlan_humann.py	Thu Jul 20 10:07:12 2023 +0000
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+
+
+def extract_clade_abundance(metaphlan_fp):
+    clade_abund = {}
+    with open(metaphlan_fp, "r") as metaphlan_f:
+        is_metaphlan_v4 = False
+        for line in metaphlan_f.readlines():
+            if 'SGB' in line:
+                # New versions of metaphlan against a recent DB contain a header line with DB name, which contains SGB
+                is_metaphlan_v4 = True
+            if line.find("g__") == -1:
+                continue
+
+            split_line = line[:-1].split("\t")
+            taxo = split_line[0]
+            if is_metaphlan_v4:
+                # Column order in new metaphlan versions:
+                # clade_name NCBI_tax_id relative_abundance additional_species
+                abundance = split_line[2]
+            else:
+                # Column order in the old metaphlan versions:
+                # clade_name relative_abundance coverage average_genome_length_in_the_clade estimated_number_of_reads_from_the_clade
+                abundance = split_line[1]
+
+            genus = taxo[(taxo.find("g__") + 3):]
+            if genus.find("|") != -1:
+                genus = genus[: (genus.find("|"))]
+            clade_abund.setdefault(genus, {"abundance": 0, "species": {}})
+            if taxo.find("t__") != -1:
+                continue
+            elif taxo.find("s__") != -1:
+                species = taxo[(taxo.find("s__") + 3):]
+                clade_abund[genus]["species"].setdefault(species, abundance)
+            else:
+                clade_abund[genus]["abundance"] = abundance
+    return clade_abund
+
+
+def compute_overall_abundance(humann_fp):
+    overall_abundance = 0
+    with open(humann_fp, "r") as humann_f:
+        for line in humann_f.readlines():
+            if line.find("|") != -1 or line.startswith("#"):
+                continue
+            split_line = line[:-1].split("\t")
+            overall_abundance += float(split_line[1])
+    return overall_abundance
+
+
+def format_characteristic_name(name):
+    formatted_n = name
+    formatted_n = formatted_n.replace("/", " ")
+    formatted_n = formatted_n.replace("-", " ")
+    formatted_n = formatted_n.replace("'", "")
+    if formatted_n.find("(") != -1 and formatted_n.find(")") != -1:
+        open_bracket = formatted_n.find("(")
+        close_bracket = formatted_n.find(")") + 1
+        formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:]
+    return formatted_n
+
+
+def combine_metaphlan_humann(args):
+    clade_abund = extract_clade_abundance(args.metaphlan_fp)
+    overall_abund = compute_overall_abundance(args.humann_fp)
+
+    with open(args.output_fp, "w") as output_f:
+        s = "genus\tgenus_abundance\tspecies\tspecies_abundance\t"
+        s = "%s\t%s_id\t%s_name\t%s_abundance\n" % (s, args.type, args.type, args.type)
+        output_f.write(s)
+        with open(args.humann_fp, "r") as humann_f:
+            for line in humann_f.readlines():
+                if line.find("|") == -1:
+                    continue
+
+                split_line = line[:-1].split("\t")
+                abundance = 100 * float(split_line[1]) / overall_abund
+                annotation = split_line[0].split("|")
+                charact = annotation[0].split(":")
+                charact_id = charact[0]
+                char_name = ""
+                if len(charact) > 1:
+                    char_name = format_characteristic_name(charact[-1])
+                taxo = annotation[1].split(".")
+
+                if taxo[0] == "unclassified":
+                    continue
+                genus = taxo[0][3:]
+                species = taxo[1][3:]
+
+                if genus not in clade_abund:
+                    print("no %s found in %s" % (genus, args.metaphlan_fp))
+                    continue
+                if species not in clade_abund[genus]["species"]:
+                    print(
+                        "No %s found in %s for % s"
+                        % (species, args.metaphlan_fp, genus)
+                    )
+                    continue
+
+                s = "%s\t%s\t" % (genus, clade_abund[genus]["abundance"])
+                s += "%s\t%s\t" % (species, clade_abund[genus]["species"][species])
+                s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance)
+                output_f.write(s)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--humann_fp", required=True)
+    parser.add_argument("--metaphlan_fp", required=True)
+    parser.add_argument("--output_fp", required=True)
+    parser.add_argument("--type", required=True, choices=["gene_families", "pathways"])
+    args = parser.parse_args()
+
+    combine_metaphlan_humann(args)