Mercurial > repos > george-weingart > maaslin
diff src/MaaslinToGraphlanAnnotation.py @ 0:e0b5980139d9
maaslin
author | george-weingart |
---|---|
date | Tue, 13 May 2014 22:00:40 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/MaaslinToGraphlanAnnotation.py Tue May 13 22:00:40 2014 -0400 @@ -0,0 +1,213 @@ +#!/usr/bin/env python +##################################################################################### +#Copyright (C) <2012> +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal in the +#Software without restriction, including without limitation the rights to use, copy, +#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +#and to permit persons to whom the Software is furnished to do so, subject to +#the following conditions: +# +#The above copyright notice and this permission notice shall be included in all copies +#or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), +# authored by the Huttenhower lab at the Harvard School of Public Health +# (contact Timothy Tickle, ttickle@hsph.harvard.edu). +##################################################################################### + +__author__ = "Timothy Tickle" +__copyright__ = "Copyright 2012" +__credits__ = ["Timothy Tickle"] +__license__ = "" +__version__ = "" +__maintainer__ = "Timothy Tickle" +__email__ = "ttickle@sph.harvard.edu" +__status__ = "Development" + +import argparse +import csv +import math +from operator import itemgetter +import re +import string +import sys + +#def funcGetColor(fNumeric,fMax): +# if fNumeric>0: +# return("#"+str(int(99*fNumeric/fMax)).zfill(2)+"0000") +# if fNumeric<0: +# return("#00"+str(int(99*abs(fNumeric/fMax))).zfill(2)+"00") +# return("#000000") + +def funcGetColor(fNumeric): + if fNumeric>0: + return sRingPositiveColor + else: + return sRingNegativeColor + +def funcGetAlpha(fNumeric,fMax): + return max(abs(fNumeric/fMax),dMinAlpha) + +#Constants +sAnnotation = "annotation" +sAnnotationColor = "annotation_background_color" +sClass = "class" +sRingAlpha = "ring_alpha" +dMinAlpha = .075 +sRingColor = "ring_color" +sRingHeight = "ring_height" +#sRingHeightMin = 0.5 +sStandardizedRingHeight = "1.01" +sRingLabel = "ring_label" +sRingLabelSizeWord = "ring_label_font_size" +sRingLabelSize = 10 +sRingLineColor = "#999999" +sRingPositiveWord = "Positive" +sRingPositiveColor = "#990000" +sRingNegativeWord = "Negative" +sRingNegativeColor = "#009900" +sRingLineColorWord = "ring_separator_color" +sRingLineThickness = "0.5" +sRingLineThicknessWord = "ring_internal_separator_thickness" +sCladeMarkerColor = "clade_marker_color" +sCladeMarkerSize = "clade_marker_size" +sHighlightedMarkerSize = "10" +c_dMinDoubleValue = 0.00000000001 + +#Set up arguments reader +argp = argparse.ArgumentParser( prog = "MaaslinToGraphlanAnnotation.py", + description = """Converts summary files to graphlan annotation files.""" ) + +#### Read in information +#Arguments +argp.add_argument("strInputSummary", metavar = "SummaryFile", type = argparse.FileType("r"), help ="Input summary file produced by maaslin") +argp.add_argument("strInputCore", metavar = "CoreFile", type = argparse.FileType("r"), help ="Core file produced by Graphlan from the maaslin pcl") +argp.add_argument("strInputHeader", metavar = "HeaderFile", type = argparse.FileType("r"), help ="Input header file to append to the generated annotation file.") +argp.add_argument("strOutputAnnotation", metavar = "AnnotationFile", type = argparse.FileType("w"), help ="Output annotation file for graphlan") + +args = argp.parse_args( ) + +#Read in the summary file and transform to class based descriptions +csvSum = open(args.strInputSummary,'r') if isinstance(args.strInputSummary, str) else args.strInputSummary +fSum = csv.reader(csvSum, delimiter="\t") +#Skip header (until i do this a better way) +fSum.next() + +#Extract associations (Metadata,taxon,coef,qvalue) +lsAssociations = [[sLine[1],sLine[2],sLine[4],sLine[7]] for sLine in fSum] +csvSum.close() + +#### Read in default graphlan settings provided by maaslin +#Read in the annotation header file +csvHdr = open(args.strInputHeader,'r') if isinstance(args.strInputHeader, str) else args.strInputHeader +fHdr = csv.reader(csvHdr, delimiter="\t") + +#Begin writting the output +#Output annotation file +csvAnn = open(args.strOutputAnnotation,'w') if isinstance(args.strOutputAnnotation, str) else args.strOutputAnnotation +fAnn = csv.writer(csvAnn, delimiter="\t") +fAnn.writerows(fHdr) +csvHdr.close() + +#If no associatiosn were found +if(len(lsAssociations)==0): + csvAnn.close() + +else: + #### Fix name formats + #Manipulate names to graphlan complient names (clades seperated by .) + lsAssociations = sorted(lsAssociations, key=itemgetter(1)) + lsAssociations = [[sBug[0]]+[re.sub("^[A-Za-z]__","",sBug[1])]+sBug[2:] for sBug in lsAssociations] + lsAssociations = [[sBug[0]]+[re.sub("\|*[A-Za-z]__|\|",".",sBug[1])]+sBug[2:] for sBug in lsAssociations] + + #If this is an OTU, append the number and the genus level together for a more descriptive termal name + lsAssociationsModForOTU = [] + for sBug in lsAssociations: + lsBug = sBug[1].split(".") + if(len(lsBug))> 1: + if(lsBug[-1].isdigit()): + lsBug[-2]=lsBug[-2]+"_"+lsBug[-1] + lsBug = lsBug[0:-1] + lsAssociationsModForOTU.append([sBug[0]]+[".".join(lsBug)]+sBug[2:]) + else: + lsAssociationsModForOTU.append([sBug[0]]+[lsBug[0]]+sBug[2:]) + + #Extract just class info + #lsClassData = [[sLine[2],sClass,sLine[1]] for sLine in fSum] + + ### Make rings + #Setup rings + dictRings = dict([[enumData[1],enumData[0]] for enumData in enumerate(set([lsData[0] for lsData in lsAssociationsModForOTU]))]) + + #Ring graphlan setting: rings represent a metadata that associates with a feature + #Rings have a line to help differetiate them + lsRingSettings = [[sRingLabel,lsPair[1],lsPair[0]] for lsPair in dictRings.items()] + lsRingLineColors = [[sRingLineColorWord,lsPair[1],sRingLineColor] for lsPair in dictRings.items()] + lsRingLineThick = [[sRingLineThicknessWord,lsPair[1],sRingLineThickness] for lsPair in dictRings.items()] + lsRingLineLabelSize = [[sRingLabelSizeWord,lsPair[1], sRingLabelSize] for lsPair in dictRings.items()] + + #Create coloring for rings color represents the directionality of the relationship + dMaxCoef = max([abs(float(sAssociation[2])) for sAssociation in lsAssociationsModForOTU]) + lsRingColors = [[lsAssociation[1], sRingColor, dictRings[lsAssociation[0]], funcGetColor(float(lsAssociation[2]))] for lsAssociation in lsAssociationsModForOTU] + lsRingAlpha = [[lsAssociation[1], sRingAlpha, dictRings[lsAssociation[0]], funcGetAlpha(float(lsAssociation[2]), dMaxCoef)] for lsAssociation in lsAssociationsModForOTU] + + #Create height for rings representing the log tranformed q-value? + dMaxQValue = max([-1*math.log(max(float(sAssociation[3]), c_dMinDoubleValue)) for sAssociation in lsAssociationsModForOTU]) + #lsRingHeights = [[lsAssociation[1], sRingHeight, dictRings[lsAssociation[0]], ((-1*math.log(max(float(lsAssociation[3]), c_dMinDoubleValue)))/dMaxQValue)+sRingHeightMin] for lsAssociation in lsAssociationsModForOTU] + lsRingHeights = [[lsAssociation[1], sRingHeight, dictRings[lsAssociation[0]], sStandardizedRingHeight] for lsAssociation in lsAssociationsModForOTU] + + #### Marker + # Marker colors (mainly to make legend + lsMarkerColors = [[lsAssociation[1], sCladeMarkerColor, funcGetColor(float(lsAssociation[2]))] for lsAssociation in lsAssociationsModForOTU] + lsMarkerSizes = [[lsAssociation[1], sCladeMarkerSize, sHighlightedMarkerSize] for lsAssociation in lsAssociationsModForOTU] + + #### Make internal highlights + #Highlight the associated clades + lsUniqueAssociatedTaxa = sorted(list(set([lsAssociation[1] for lsAssociation in lsAssociationsModForOTU]))) + + lsHighlights = [] + sABCPrefix = "" + sListABC = string.ascii_lowercase + iListABCIndex = 0 + for lsHighlight in lsUniqueAssociatedTaxa: + lsTaxa = lsHighlight.split(".") + sLabel = sABCPrefix+sListABC[iListABCIndex]+":"+lsTaxa[-1] if len(lsTaxa) > 2 else lsTaxa[-1] + lsHighlights.append([lsHighlight, sAnnotation, sLabel]) + iListABCIndex = iListABCIndex + 1 + if iListABCIndex > 25: + iListABCIndex = 0 + sABCPrefix = sABCPrefix + sListABC[len(sABCPrefix)] + + #Read in the core file + csvCore = open(args.strInputCore,'r') if isinstance(args.strInputCore, str) else args.strInputCore + fSum = csv.reader(csvCore, delimiter="\t") + + #Add in all phylum just incase they were not already included here + lsAddSecondLevel = list(set([sUnique[0].split(".")[1] for sUnique in fSum if len(sUnique[0].split(".")) > 1])) + lsHighlights.extend([[sSecondLevel, sAnnotation, sSecondLevel] for sSecondLevel in lsAddSecondLevel]) + lsHighlightColor = [[lsHighlight[0], sAnnotationColor,"b"] for lsHighlight in lsHighlights] + + #### Write the remaining output annotation file + fAnn.writerows(lsRingSettings) + fAnn.writerows(lsRingLineColors) + fAnn.writerows(lsRingColors) + fAnn.writerows(lsRingAlpha) + fAnn.writerows(lsRingLineThick) + fAnn.writerows(lsRingLineLabelSize) + fAnn.writerows(lsRingHeights) + fAnn.writerows(lsMarkerColors) + fAnn.writerows(lsMarkerSizes) + fAnn.writerows([[sRingPositiveWord, sCladeMarkerColor, sRingPositiveColor]]) + fAnn.writerows([[sRingNegativeWord, sCladeMarkerColor, sRingNegativeColor]]) + fAnn.writerows(lsHighlights) + fAnn.writerows(lsHighlightColor) + csvAnn.close()