Mercurial > repos > qfabrepo > metadegalaxy_uc2otutable
comparison uclust2otutable.py @ 0:e85e7ba38aff draft
"planemo upload for repository https://github.com/QFAB-Bioinformatics/metaDEGalaxy/tree/master/uc2otutable commit 0db3cb4e9a87400bb2f8402ffc23334e24ad4b4e-dirty"
| author | qfabrepo |
|---|---|
| date | Mon, 14 Sep 2020 04:52:36 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e85e7ba38aff |
|---|---|
| 1 import sys | |
| 2 import progress | |
| 3 import subprocess | |
| 4 import tempfile | |
| 5 import traceback | |
| 6 import argparse | |
| 7 | |
| 8 parser = argparse.ArgumentParser( | |
| 9 description="This script converts uclust format from vsearch to tabular format" | |
| 10 ) | |
| 11 parser.add_argument("-v","--version",action="version",version="%(prog)s 1.0") | |
| 12 parser.add_argument("-i","--input",dest="uclust",default=False,help="input filename in uclust format") | |
| 13 parser.add_argument("-o","--output",dest="otutable",default=False,help="output filename") | |
| 14 | |
| 15 | |
| 16 if(len(sys.argv) == 1): | |
| 17 parser.print_help(sys.stderr) | |
| 18 sys.exit() | |
| 19 | |
| 20 args = parser.parse_args() | |
| 21 | |
| 22 ucFileName = args.uclust | |
| 23 outFileName = args.otutable | |
| 24 | |
| 25 | |
| 26 # Tab-separated fields: | |
| 27 # 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=Label | |
| 28 # Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NotMatched | |
| 29 # For C and D types, PctId is average id with seed. | |
| 30 # QueryStart and SeedStart are zero-based relative to start of sequence. | |
| 31 # If minus strand, SeedStart is relative to reverse-complemented seed. | |
| 32 | |
| 33 MaxError = -1 | |
| 34 | |
| 35 Type = '?' | |
| 36 ClusterNr = -1 | |
| 37 Size = -1 | |
| 38 PctId = -1.0 | |
| 39 LocalScore = -1.0 | |
| 40 Evalue = -1.0 | |
| 41 Strand = '.' | |
| 42 QueryStart = -1 | |
| 43 SeedStart = -1 | |
| 44 Alignment = "" | |
| 45 QueryLabel = "" | |
| 46 TargetLabel = "" | |
| 47 FileName = "?" | |
| 48 Line = "" | |
| 49 | |
| 50 TRUNC_LABELS=0 | |
| 51 | |
| 52 def GetSampleId(Label): | |
| 53 sep=";" | |
| 54 SampleID_temp = Label.split(sep,1)[0] | |
| 55 SampleID = SampleID_temp.split('_',1)[-1] | |
| 56 return SampleID | |
| 57 | |
| 58 def OnRec(): | |
| 59 global OTUs, Samples, OTUTable | |
| 60 if Type != 'H': | |
| 61 return | |
| 62 | |
| 63 OTUId = TargetLabel | |
| 64 if OTUId not in OTUIds: | |
| 65 OTUIds.append(OTUId) | |
| 66 OTUTable[OTUId] = {} | |
| 67 | |
| 68 SampleId = GetSampleId(QueryLabel) | |
| 69 if SampleId not in SampleIds: | |
| 70 SampleIds.append(SampleId) | |
| 71 | |
| 72 N = GetSizeFromLabel(QueryLabel, 1) | |
| 73 try: | |
| 74 OTUTable[OTUId][SampleId] += N | |
| 75 except: | |
| 76 OTUTable[OTUId][SampleId] = N | |
| 77 | |
| 78 def Die(Msg): | |
| 79 print >> sys.stderr | |
| 80 print >> sys.stderr | |
| 81 | |
| 82 traceback.print_stack() | |
| 83 s = "" | |
| 84 for i in range(0, len(sys.argv)): | |
| 85 if i > 0: | |
| 86 s += " " | |
| 87 s += sys.argv[i] | |
| 88 print >> sys.stderr, s | |
| 89 print >> sys.stderr, "**ERROR**", Msg | |
| 90 print >> sys.stderr | |
| 91 print >> sys.stderr | |
| 92 sys.exit(1) | |
| 93 print("NOTHERE!!") | |
| 94 | |
| 95 def Warning(Msg): | |
| 96 print >> sys.stderr | |
| 97 print >> sys.stderr, sys.argv | |
| 98 print >> sys.stderr, "**WARNING**", Msg | |
| 99 | |
| 100 def isgap(c): | |
| 101 return c == '-' or c == '.' | |
| 102 | |
| 103 def GetSeqCount(FileName): | |
| 104 Tmp = tempfile.TemporaryFile() | |
| 105 try: | |
| 106 TmpFile = Tmp.file | |
| 107 except: | |
| 108 TmpFile = Tmp | |
| 109 s = subprocess.call([ "grep", "-c", "^>", FileName ], stdout=TmpFile) | |
| 110 TmpFile.seek(0) | |
| 111 s = TmpFile.read() | |
| 112 return int(s) | |
| 113 | |
| 114 def GetSeqsDict(FileName): | |
| 115 return ReadSeqsFast(FileName, False) | |
| 116 | |
| 117 def ReadSeqsDict(FileName, Progress = False): | |
| 118 return ReadSeqsFast(FileName, Progress) | |
| 119 | |
| 120 def ReadSeqsOnSeq(FileName, OnSeq, Progress = False): | |
| 121 ReadSeqs3(FileName, OnSeq, Progress) | |
| 122 | |
| 123 def ReadSeqsFastFile(File, Progress = False): | |
| 124 Seqs = {} | |
| 125 Id = "" | |
| 126 N = 0 | |
| 127 while 1: | |
| 128 if N%10000 == 0 and Progress: | |
| 129 sys.stderr.write("%u seqs\r" % (N)) | |
| 130 Line = File.readline() | |
| 131 if len(Line) == 0: | |
| 132 if Progress: | |
| 133 sys.stderr.write("%u seqs\n" % (N)) | |
| 134 return Seqs | |
| 135 if len(Line) == 0: | |
| 136 continue | |
| 137 Line = Line.strip() | |
| 138 if Line[0] == ">": | |
| 139 N += 1 | |
| 140 Id = Line[1:] | |
| 141 if TRUNC_LABELS: | |
| 142 Id = Id.split()[0] | |
| 143 Seqs[Id] = "" | |
| 144 else: | |
| 145 if Id == "": | |
| 146 Die("FASTA file does not start with '>'") | |
| 147 Seqs[Id] = Seqs[Id] + Line | |
| 148 | |
| 149 def ReadSeqsFast(FileName, Progress = True): | |
| 150 File = open(FileName) | |
| 151 return ReadSeqsFastFile(File, Progress) | |
| 152 | |
| 153 def ReadSeqs(FileName, toupper=False, stripgaps=False, Progress=False): | |
| 154 if not toupper and not stripgaps: | |
| 155 return ReadSeqsFast(FileName, False) | |
| 156 | |
| 157 Seqs = {} | |
| 158 Id = "" | |
| 159 File = open(FileName) | |
| 160 while 1: | |
| 161 Line = File.readline() | |
| 162 if len(Line) == 0: | |
| 163 return Seqs | |
| 164 Line = Line.strip() | |
| 165 if len(Line) == 0: | |
| 166 continue | |
| 167 if Line[0] == ">": | |
| 168 Id = Line[1:] | |
| 169 if TRUNC_LABELS: | |
| 170 Id = Id.split()[0] | |
| 171 if Id in Seqs.keys(): | |
| 172 Die("Duplicate id '%s' in '%s'" % (Id, FileName)) | |
| 173 Seqs[Id] = "" | |
| 174 else: | |
| 175 if Id == "": | |
| 176 Die("FASTA file '%s' does not start with '>'" % FileName) | |
| 177 if toupper: | |
| 178 Line = Line.upper() | |
| 179 if stripgaps: | |
| 180 Line = Line.replace("-", "") | |
| 181 Line = Line.replace(".", "") | |
| 182 Seqs[Id] = Seqs[Id] + Line | |
| 183 | |
| 184 def ReadSeqs2(FileName, ShowProgress = True): | |
| 185 Seqs = [] | |
| 186 Labels = [] | |
| 187 File = open(FileName) | |
| 188 if ShowProgress: | |
| 189 progress.InitFile(File, FileName) | |
| 190 while 1: | |
| 191 progress.File() | |
| 192 Line = File.readline() | |
| 193 if len(Line) == 0: | |
| 194 if ShowProgress: | |
| 195 print >> sys.stderr, "\n" | |
| 196 return Labels, Seqs | |
| 197 Line = Line.strip() | |
| 198 if len(Line) == 0: | |
| 199 continue | |
| 200 if Line[0] == ">": | |
| 201 Id = Line[1:] | |
| 202 if TRUNC_LABELS: | |
| 203 Id = Id.split()[0] | |
| 204 Labels.append(Id) | |
| 205 Seqs.append("") | |
| 206 else: | |
| 207 i = len(Seqs)-1 | |
| 208 Seqs[i] = Seqs[i] + Line | |
| 209 | |
| 210 def ReadSeqs3(FileName, OnSeq, ShowProgress = True): | |
| 211 File = open(FileName) | |
| 212 if ShowProgress: | |
| 213 progress.InitFile(File, FileName) | |
| 214 Label = "" | |
| 215 Seq = "" | |
| 216 while 1: | |
| 217 Line = File.readline() | |
| 218 if len(Line) == 0: | |
| 219 if Seq != "": | |
| 220 OnSeq(Label, Seq) | |
| 221 if ShowProgress: | |
| 222 print >> sys.stderr, "\n" | |
| 223 return | |
| 224 Line = Line.strip() | |
| 225 if len(Line) == 0: | |
| 226 continue | |
| 227 if Line[0] == ">": | |
| 228 if Seq != "": | |
| 229 if ShowProgress: | |
| 230 progress.File() | |
| 231 if TRUNC_LABELS: | |
| 232 Label = Label.split()[0] | |
| 233 OnSeq(Label, Seq) | |
| 234 Label = Line[1:] | |
| 235 Seq = "" | |
| 236 else: | |
| 237 Seq += Line | |
| 238 | |
| 239 def WriteSeq(File, Seq, Label = ""): | |
| 240 if Label != "": | |
| 241 print >> File, ">" + Label | |
| 242 BLOCKLENGTH = 80 | |
| 243 SeqLength = len(Seq) | |
| 244 BlockCount = int((SeqLength + (BLOCKLENGTH-1))/BLOCKLENGTH) | |
| 245 for BlockIndex in range(0, BlockCount): | |
| 246 Block = Seq[BlockIndex*BLOCKLENGTH:] | |
| 247 Block = Block[:BLOCKLENGTH] | |
| 248 print >> File, Block | |
| 249 | |
| 250 def GetSizeFromLabel(Label, Default = -1): | |
| 251 Fields = Label.split(";") | |
| 252 for Field in Fields: | |
| 253 if Field.startswith("size="): | |
| 254 return int(Field[5:]) | |
| 255 if Default == -1: | |
| 256 Die("Missing size >" + Label) | |
| 257 return Default | |
| 258 | |
| 259 def StripSizeFromLabel(Label): | |
| 260 Fields = Label.split(";") | |
| 261 NewLabel = "" | |
| 262 for Field in Fields: | |
| 263 if Field.startswith("size="): | |
| 264 continue | |
| 265 if NewLabel != "": | |
| 266 NewLabel += ";" | |
| 267 NewLabel += Field | |
| 268 return NewLabel | |
| 269 | |
| 270 def GetQualFromLabel(Label): | |
| 271 n = Label.find("qual=") | |
| 272 assert n >= 0 | |
| 273 return Label[n+5:-1] | |
| 274 | |
| 275 def StripQualFromLabel(Label): | |
| 276 n = Label.find("qual=") | |
| 277 assert n >= 0 | |
| 278 return Label[:n] | |
| 279 | |
| 280 def GetField(Label, Name, Default): | |
| 281 Fields = Label.split(';') | |
| 282 for Field in Fields: | |
| 283 if Field.startswith(Name + "="): | |
| 284 n = len(Name) + 1 | |
| 285 return Field[n:] | |
| 286 if Default == "": | |
| 287 Die("Field %s= not found in >%s" % (Name, Label)) | |
| 288 return Default | |
| 289 | |
| 290 def GetIntFieldFromLabel(Label, Name, Default): | |
| 291 return int(GetField(Label, Name, Default)) | |
| 292 | |
| 293 def GetFieldFromLabel(Label, Name, Default): | |
| 294 return GetField(Label, Name, Default) | |
| 295 | |
| 296 def DeleteFieldFromLabel(Label, Name): | |
| 297 NewLabel = "" | |
| 298 Fields = Label.split(';') | |
| 299 for Field in Fields: | |
| 300 if len(Field) > 0 and not Field.startswith(Name + "="): | |
| 301 NewLabel += Field + ';' | |
| 302 return NewLabel | |
| 303 | |
| 304 def ReplaceSize(Label, Size): | |
| 305 Fields = Label.split(";") | |
| 306 NewLabel = "" | |
| 307 Done = False | |
| 308 for Field in Fields: | |
| 309 if Field.startswith("size="): | |
| 310 NewLabel += "size=%u;" % Size | |
| 311 Done = True | |
| 312 else: | |
| 313 if Field != "": | |
| 314 NewLabel += Field + ";" | |
| 315 if not Done: | |
| 316 die.Die("size= not found in >" + Label) | |
| 317 return NewLabel | |
| 318 | |
| 319 def Error(s): | |
| 320 print >> sys.stderr, "*** ERROR ***", s, sys.argv | |
| 321 sys.exit(1) | |
| 322 | |
| 323 def ProgressFile(File, FileSize): | |
| 324 # if not sys.stderr.isatty(): | |
| 325 # return | |
| 326 Pos = File.tell() | |
| 327 Pct = (100.0*Pos)/FileSize | |
| 328 Str = "%s %5.1f%%\r" % (FileName, Pct) | |
| 329 sys.stderr.write(Str) | |
| 330 | |
| 331 def Progress(i, N): | |
| 332 # if not sys.stderr.isatty(): | |
| 333 return | |
| 334 Pct = (100.0*i)/N | |
| 335 Str = "%5.1f%%\r" % Pct | |
| 336 sys.stderr.write(Str) | |
| 337 | |
| 338 def PrintLine(): | |
| 339 print(Line) | |
| 340 | |
| 341 def ParseRec(Line): | |
| 342 global Type | |
| 343 global ClusterNr | |
| 344 global Size | |
| 345 global PctId | |
| 346 global Strand | |
| 347 global QueryStart | |
| 348 global SeedStart | |
| 349 global Alignment | |
| 350 global QueryLabel | |
| 351 global TargetLabel | |
| 352 global LocalScore | |
| 353 global Evalue | |
| 354 | |
| 355 Fields = Line.split("\t") | |
| 356 N = len(Fields) | |
| 357 if N != 9 and N != 10: | |
| 358 Error("Expected 9 or 10 fields in .uc record, got: " + Line) | |
| 359 Type = Fields[0] | |
| 360 | |
| 361 try: | |
| 362 ClusterNr = int(Fields[1]) | |
| 363 except: | |
| 364 ClusterNr = -1 | |
| 365 | |
| 366 try: | |
| 367 Size = int(Fields[2]) | |
| 368 except: | |
| 369 Size = -1 | |
| 370 | |
| 371 Fields2 = Fields[3].split('/') | |
| 372 LocalScore = -1.0 | |
| 373 Evalue = -1.0 | |
| 374 if len(Fields2) == 3: | |
| 375 try: | |
| 376 PctId = float(Fields2[0]) | |
| 377 LocalScore = float(Fields2[1]) | |
| 378 Evalue = float(Fields2[2]) | |
| 379 except: | |
| 380 PctId = -1.0 | |
| 381 else: | |
| 382 try: | |
| 383 PctId = float(Fields[3]) | |
| 384 except: | |
| 385 PctId = -1.0 | |
| 386 | |
| 387 Strand = Fields[4] | |
| 388 | |
| 389 try: | |
| 390 QueryStart = int(Fields[5]) | |
| 391 except: | |
| 392 QueryStart = -1 | |
| 393 | |
| 394 try: | |
| 395 SeedStart = int(Fields[6]) | |
| 396 except: | |
| 397 SeedStart = -1 | |
| 398 | |
| 399 Alignment = Fields[7] | |
| 400 QueryLabel = Fields[8] | |
| 401 | |
| 402 if len(Fields) > 9: | |
| 403 TargetLabel = Fields[9] | |
| 404 | |
| 405 def GetRec(File, OnRecord): | |
| 406 global Line | |
| 407 while 1: | |
| 408 Line = File.readline() | |
| 409 if len(Line) == 0: | |
| 410 return 0 | |
| 411 if Line[0] == '#': | |
| 412 continue | |
| 413 Line = Line.strip() | |
| 414 if len(Line) == 0: | |
| 415 return 1 | |
| 416 ParseRec(Line) | |
| 417 Ok = OnRecord() | |
| 418 if Ok != None and Ok == 0: | |
| 419 return 0 | |
| 420 return 1 | |
| 421 | |
| 422 def ReadRecs(argFileName, OnRecord, ShowProgress = False): | |
| 423 return ReadFile(argFileName, OnRecord, ShowProgress) | |
| 424 | |
| 425 def ReadRecsOnRec(argFileName, OnRecord, ShowProgress = True): | |
| 426 return ReadFile(argFileName, OnRecord, ShowProgress) | |
| 427 | |
| 428 def GetRecs(argFileName, OnRecord, ShowProgress = True): | |
| 429 return ReadFile(argFileName, OnRecord, ShowProgress) | |
| 430 | |
| 431 def ReadFile(argFileName, OnRecord, ShowProgress = True): | |
| 432 global FileName | |
| 433 FileName = argFileName | |
| 434 File = open(FileName) | |
| 435 | |
| 436 if ShowProgress: | |
| 437 progress.InitFile(File, FileName) | |
| 438 while GetRec(File, OnRecord): | |
| 439 if ShowProgress: | |
| 440 progress.File() | |
| 441 if ShowProgress: | |
| 442 progress.FileDone() | |
| 443 | |
| 444 OTUIds = [] | |
| 445 SampleIds = [] | |
| 446 OTUTable = {} | |
| 447 | |
| 448 ReadRecs(ucFileName, OnRec) | |
| 449 | |
| 450 fout=open(outFileName,'w') | |
| 451 | |
| 452 s = "OTUId" | |
| 453 for SampleId in SampleIds: | |
| 454 s += "\t" + SampleId | |
| 455 | |
| 456 fout.write("%s\n" % s) | |
| 457 | |
| 458 for OTUId in OTUIds: | |
| 459 s = OTUId | |
| 460 for SampleId in SampleIds: | |
| 461 try: | |
| 462 n = OTUTable[OTUId][SampleId] | |
| 463 except: | |
| 464 n = 0 | |
| 465 s += "\t" + str(n) | |
| 466 fout.write("%s\n" % s) | |
| 467 | |
| 468 fout.close() |
