Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-0.2.0/src/squid/stockholm.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ff1768533a07 |
---|---|
1 /***************************************************************** | |
2 * SQUID - a library of functions for biological sequence analysis | |
3 * Copyright (C) 1992-2002 Washington University School of Medicine | |
4 * | |
5 * This source code is freely distributed under the terms of the | |
6 * GNU General Public License. See the files COPYRIGHT and LICENSE | |
7 * for details. | |
8 *****************************************************************/ | |
9 | |
10 /* stockholm.c | |
11 * SRE, Fri May 28 15:46:41 1999 | |
12 * | |
13 * Reading/writing of Stockholm format multiple sequence alignments. | |
14 * | |
15 * example of API: | |
16 * | |
17 * MSA *msa; | |
18 * FILE *fp; -- opened for write with fopen() | |
19 * MSAFILE *afp; -- opened for read with MSAFileOpen() | |
20 * | |
21 * while ((msa = ReadStockholm(afp)) != NULL) | |
22 * { | |
23 * WriteStockholm(fp, msa); | |
24 * MSAFree(msa); | |
25 * } | |
26 * | |
27 * RCS $Id: stockholm.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp) | |
28 */ | |
29 #include <stdio.h> | |
30 #include <string.h> | |
31 #include "squid.h" | |
32 #include "msa.h" | |
33 | |
34 static int parse_gf(MSA *msa, char *buf); | |
35 static int parse_gs(MSA *msa, char *buf); | |
36 static int parse_gc(MSA *msa, char *buf); | |
37 static int parse_gr(MSA *msa, char *buf); | |
38 static int parse_comment(MSA *msa, char *buf); | |
39 static int parse_sequence(MSA *msa, char *buf); | |
40 static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl); | |
41 | |
42 #ifdef TESTDRIVE_STOCKHOLM | |
43 /***************************************************************** | |
44 * stockholm.c test driver: | |
45 * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm | |
46 * | |
47 */ | |
48 int | |
49 main(int argc, char **argv) | |
50 { | |
51 MSAFILE *afp; | |
52 MSA *msa; | |
53 char *file; | |
54 | |
55 file = argv[1]; | |
56 | |
57 if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL) | |
58 Die("Couldn't open %s\n", file); | |
59 | |
60 while ((msa = ReadStockholm(afp)) != NULL) | |
61 { | |
62 WriteStockholm(stdout, msa); | |
63 MSAFree(msa); | |
64 } | |
65 | |
66 MSAFileClose(afp); | |
67 exit(0); | |
68 } | |
69 /******************************************************************/ | |
70 #endif /* testdriver */ | |
71 | |
72 | |
73 /* Function: ReadStockholm() | |
74 * Date: SRE, Fri May 21 17:33:10 1999 [St. Louis] | |
75 * | |
76 * Purpose: Parse the next alignment from an open Stockholm | |
77 * format alignment file. Return the alignment, or | |
78 * NULL if there are no more alignments in the file. | |
79 * | |
80 * Args: afp - open alignment file | |
81 * | |
82 * Returns: MSA * - an alignment object. | |
83 * caller responsible for an MSAFree() | |
84 * NULL if no more alignments | |
85 * | |
86 * Diagnostics: | |
87 * Will Die() here with a (potentially) useful message | |
88 * if a parsing error occurs | |
89 */ | |
90 MSA * | |
91 ReadStockholm(MSAFILE *afp) | |
92 { | |
93 MSA *msa; | |
94 char *s; | |
95 int status; | |
96 | |
97 if (feof(afp->f)) return NULL; | |
98 | |
99 /* Initialize allocation of the MSA. | |
100 */ | |
101 msa = MSAAlloc(10, 0); | |
102 | |
103 /* Check the magic Stockholm header line. | |
104 * We have to skip blank lines here, else we perceive | |
105 * trailing blank lines in a file as a format error when | |
106 * reading in multi-record mode. | |
107 */ | |
108 do { | |
109 if ((s = MSAFileGetLine(afp)) == NULL) { | |
110 MSAFree(msa); | |
111 return NULL; | |
112 } | |
113 } while (IsBlankline(s)); | |
114 | |
115 if (strncmp(s, "# STOCKHOLM 1.", 14) != 0) | |
116 Die("\ | |
117 File %s doesn't appear to be in Stockholm format.\n\ | |
118 Assuming there isn't some other problem with your file (it is an\n\ | |
119 alignment file, right?), please either:\n\ | |
120 a) use the Babelfish format autotranslator option (-B, usually);\n\ | |
121 b) specify the file's format with the --informat option; or\n\ | |
122 a) reformat the alignment to Stockholm format.\n", | |
123 afp->fname); | |
124 | |
125 /* Read the alignment file one line at a time. | |
126 */ | |
127 while ((s = MSAFileGetLine(afp)) != NULL) | |
128 { | |
129 while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */ | |
130 | |
131 if (*s == '#') { | |
132 if (strncmp(s, "#=GF", 4) == 0) status = parse_gf(msa, s); | |
133 else if (strncmp(s, "#=GS", 4) == 0) status = parse_gs(msa, s); | |
134 else if (strncmp(s, "#=GC", 4) == 0) status = parse_gc(msa, s); | |
135 else if (strncmp(s, "#=GR", 4) == 0) status = parse_gr(msa, s); | |
136 else status = parse_comment(msa, s); | |
137 } | |
138 else if (strncmp(s, "//", 2) == 0) break; | |
139 else if (*s == '\n') continue; | |
140 else status = parse_sequence(msa, s); | |
141 | |
142 if (status == 0) | |
143 Die("Stockholm format parse error: line %d of file %s while reading alignment %s", | |
144 afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name); | |
145 } | |
146 | |
147 if (s == NULL && msa->nseq != 0) | |
148 Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name); | |
149 | |
150 if (s == NULL && msa->nseq == 0) { | |
151 /* probably just some junk at end of file */ | |
152 MSAFree(msa); | |
153 return NULL; | |
154 } | |
155 | |
156 MSAVerifyParse(msa); | |
157 return msa; | |
158 } | |
159 | |
160 | |
161 /* Function: WriteStockholm() | |
162 * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis] | |
163 * | |
164 * Purpose: Write an alignment in standard multi-block | |
165 * Stockholm format to an open file. A wrapper | |
166 * for actually_write_stockholm(). | |
167 * | |
168 * Args: fp - file that's open for writing | |
169 * msa - alignment to write | |
170 * | |
171 * Returns: (void) | |
172 */ | |
173 void | |
174 WriteStockholm(FILE *fp, MSA *msa) | |
175 { | |
176 actually_write_stockholm(fp, msa, 50); /* 50 char per block */ | |
177 } | |
178 | |
179 /* Function: WriteStockholmOneBlock() | |
180 * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis] | |
181 * | |
182 * Purpose: Write an alignment in Pfam's single-block | |
183 * Stockholm format to an open file. A wrapper | |
184 * for actually_write_stockholm(). | |
185 * | |
186 * Args: fp - file that's open for writing | |
187 * msa - alignment to write | |
188 * | |
189 * Returns: (void) | |
190 */ | |
191 void | |
192 WriteStockholmOneBlock(FILE *fp, MSA *msa) | |
193 { | |
194 actually_write_stockholm(fp, msa, msa->alen); /* one big block */ | |
195 } | |
196 | |
197 | |
198 /* Function: actually_write_stockholm() | |
199 * Date: SRE, Fri May 21 17:39:22 1999 [St. Louis] | |
200 * | |
201 * Purpose: Write an alignment in Stockholm format to | |
202 * an open file. This is the function that actually | |
203 * does the work. The API's WriteStockholm() | |
204 * and WriteStockholmOneBlock() are wrappers. | |
205 * | |
206 * Args: fp - file that's open for writing | |
207 * msa - alignment to write | |
208 * cpl - characters to write per line in alignment block | |
209 * | |
210 * Returns: (void) | |
211 */ | |
212 static void | |
213 actually_write_stockholm(FILE *fp, MSA *msa, int cpl) | |
214 { | |
215 int i, j; | |
216 int len = 0; | |
217 int namewidth; | |
218 int typewidth = 0; /* markup tags are up to 5 chars long */ | |
219 int markupwidth = 0; /* #=GR, #=GC are four char wide + 1 space */ | |
220 char *buf; | |
221 int currpos; | |
222 char *s, *tok; | |
223 | |
224 /* Figure out how much space we need for name + markup | |
225 * to keep the alignment in register. Required by Stockholm | |
226 * spec, even though our Stockholm parser doesn't care (Erik's does). | |
227 */ | |
228 namewidth = 0; | |
229 for (i = 0; i < msa->nseq; i++) | |
230 if ((len = strlen(msa->sqname[i])) > namewidth) | |
231 namewidth = len; | |
232 | |
233 /* Figure out how much space we need for markup tags | |
234 * markupwidth = always 4 if we're doing markup: strlen("#=GR") | |
235 * typewidth = longest markup tag | |
236 */ | |
237 if (msa->ss != NULL) { markupwidth = 4; typewidth = 2; } | |
238 if (msa->sa != NULL) { markupwidth = 4; typewidth = 2; } | |
239 for (i = 0; i < msa->ngr; i++) | |
240 if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len; | |
241 | |
242 if (msa->rf != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; } | |
243 if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; } | |
244 if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; } | |
245 for (i = 0; i < msa->ngc; i++) | |
246 if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len; | |
247 | |
248 buf = MallocOrDie(sizeof(char) * (cpl+namewidth+typewidth+markupwidth+61)); | |
249 | |
250 /* Magic Stockholm header | |
251 */ | |
252 fprintf(fp, "# STOCKHOLM 1.0\n"); | |
253 | |
254 /* Free text comments | |
255 */ | |
256 for (i = 0; i < msa->ncomment; i++) | |
257 fprintf(fp, "# %s\n", msa->comment[i]); | |
258 if (msa->ncomment > 0) fprintf(fp, "\n"); | |
259 | |
260 /* GF section: per-file annotation | |
261 */ | |
262 if (msa->name != NULL) fprintf(fp, "#=GF ID %s\n", msa->name); | |
263 if (msa->acc != NULL) fprintf(fp, "#=GF AC %s\n", msa->acc); | |
264 if (msa->desc != NULL) fprintf(fp, "#=GF DE %s\n", msa->desc); | |
265 if (msa->au != NULL) fprintf(fp, "#=GF AU %s\n", msa->au); | |
266 | |
267 /* Thresholds are hacky. Pfam has two. Rfam has one. | |
268 */ | |
269 if (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2]) | |
270 fprintf(fp, "#=GF GA %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]); | |
271 else if (msa->cutoff_is_set[MSA_CUTOFF_GA1]) | |
272 fprintf(fp, "#=GF GA %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]); | |
273 if (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2]) | |
274 fprintf(fp, "#=GF NC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]); | |
275 else if (msa->cutoff_is_set[MSA_CUTOFF_NC1]) | |
276 fprintf(fp, "#=GF NC %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]); | |
277 if (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2]) | |
278 fprintf(fp, "#=GF TC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]); | |
279 else if (msa->cutoff_is_set[MSA_CUTOFF_TC1]) | |
280 fprintf(fp, "#=GF TC %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]); | |
281 | |
282 for (i = 0; i < msa->ngf; i++) | |
283 fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]); | |
284 fprintf(fp, "\n"); | |
285 | |
286 | |
287 /* GS section: per-sequence annotation | |
288 */ | |
289 if (msa->flags & MSA_SET_WGT) | |
290 { | |
291 for (i = 0; i < msa->nseq; i++) | |
292 fprintf(fp, "#=GS %-*.*s WT %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]); | |
293 fprintf(fp, "\n"); | |
294 } | |
295 if (msa->sqacc != NULL) | |
296 { | |
297 for (i = 0; i < msa->nseq; i++) | |
298 if (msa->sqacc[i] != NULL) | |
299 fprintf(fp, "#=GS %-*.*s AC %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]); | |
300 fprintf(fp, "\n"); | |
301 } | |
302 if (msa->sqdesc != NULL) | |
303 { | |
304 for (i = 0; i < msa->nseq; i++) | |
305 if (msa->sqdesc[i] != NULL) | |
306 fprintf(fp, "#=GS %*.*s DE %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]); | |
307 fprintf(fp, "\n"); | |
308 } | |
309 for (i = 0; i < msa->ngs; i++) | |
310 { | |
311 /* Multiannotated GS tags are possible; for example, | |
312 * #=GS foo DR PDB; 1xxx; | |
313 * #=GS foo DR PDB; 2yyy; | |
314 * These are stored, for example, as: | |
315 * msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;" | |
316 * and must be decomposed. | |
317 */ | |
318 for (j = 0; j < msa->nseq; j++) | |
319 if (msa->gs[i][j] != NULL) | |
320 { | |
321 s = msa->gs[i][j]; | |
322 while ((tok = sre_strtok(&s, "\n", NULL)) != NULL) | |
323 fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth, | |
324 msa->sqname[j], msa->gs_tag[i], tok); | |
325 } | |
326 fprintf(fp, "\n"); | |
327 } | |
328 | |
329 /* Alignment section: | |
330 * contains aligned sequence, #=GR annotation, and #=GC annotation | |
331 */ | |
332 for (currpos = 0; currpos < msa->alen; currpos += cpl) | |
333 { | |
334 if (currpos > 0) fprintf(fp, "\n"); | |
335 for (i = 0; i < msa->nseq; i++) | |
336 { | |
337 strncpy(buf, msa->aseq[i] + currpos, cpl); | |
338 buf[cpl] = '\0'; | |
339 fprintf(fp, "%-*.*s %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth, | |
340 msa->sqname[i], buf); | |
341 | |
342 if (msa->ss != NULL && msa->ss[i] != NULL) { | |
343 strncpy(buf, msa->ss[i] + currpos, cpl); | |
344 buf[cpl] = '\0'; | |
345 fprintf(fp, "#=GR %-*.*s SS %s\n", namewidth, namewidth, msa->sqname[i], buf); | |
346 } | |
347 if (msa->sa != NULL && msa->sa[i] != NULL) { | |
348 strncpy(buf, msa->sa[i] + currpos, cpl); | |
349 buf[cpl] = '\0'; | |
350 fprintf(fp, "#=GR %-*.*s SA %s\n", namewidth, namewidth, msa->sqname[i], buf); | |
351 } | |
352 for (j = 0; j < msa->ngr; j++) | |
353 if (msa->gr[j][i] != NULL) { | |
354 strncpy(buf, msa->gr[j][i] + currpos, cpl); | |
355 buf[cpl] = '\0'; | |
356 fprintf(fp, "#=GR %-*.*s %5s %s\n", | |
357 namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf); | |
358 } | |
359 } | |
360 if (msa->ss_cons != NULL) { | |
361 strncpy(buf, msa->ss_cons + currpos, cpl); | |
362 buf[cpl] = '\0'; | |
363 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf); | |
364 } | |
365 | |
366 if (msa->sa_cons != NULL) { | |
367 strncpy(buf, msa->sa_cons + currpos, cpl); | |
368 buf[cpl] = '\0'; | |
369 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf); | |
370 } | |
371 | |
372 if (msa->rf != NULL) { | |
373 strncpy(buf, msa->rf + currpos, cpl); | |
374 buf[cpl] = '\0'; | |
375 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf); | |
376 } | |
377 for (j = 0; j < msa->ngc; j++) { | |
378 strncpy(buf, msa->gc[j] + currpos, cpl); | |
379 buf[cpl] = '\0'; | |
380 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, | |
381 msa->gc_tag[j], buf); | |
382 } | |
383 } | |
384 fprintf(fp, "//\n"); | |
385 free(buf); | |
386 } | |
387 | |
388 | |
389 | |
390 | |
391 | |
392 /* Format of a GF line: | |
393 * #=GF <featurename> <text> | |
394 */ | |
395 static int | |
396 parse_gf(MSA *msa, char *buf) | |
397 { | |
398 char *gf; | |
399 char *featurename; | |
400 char *text; | |
401 char *s; | |
402 | |
403 s = buf; | |
404 if ((gf = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
405 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
406 if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0; | |
407 while (*text && (*text == ' ' || *text == '\t')) text++; | |
408 | |
409 if (strcmp(featurename, "ID") == 0) | |
410 msa->name = sre_strdup(text, -1); | |
411 else if (strcmp(featurename, "AC") == 0) | |
412 msa->acc = sre_strdup(text, -1); | |
413 else if (strcmp(featurename, "DE") == 0) | |
414 msa->desc = sre_strdup(text, -1); | |
415 else if (strcmp(featurename, "AU") == 0) | |
416 msa->au = sre_strdup(text, -1); | |
417 else if (strcmp(featurename, "GA") == 0) | |
418 { /* Pfam has GA1, GA2. Rfam just has GA1. */ | |
419 s = text; | |
420 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
421 msa->cutoff[MSA_CUTOFF_GA1] = atof(text); | |
422 msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE; | |
423 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) { | |
424 msa->cutoff[MSA_CUTOFF_GA2] = atof(text); | |
425 msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE; | |
426 } | |
427 } | |
428 else if (strcmp(featurename, "NC") == 0) | |
429 { | |
430 s = text; | |
431 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
432 msa->cutoff[MSA_CUTOFF_NC1] = atof(text); | |
433 msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE; | |
434 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) { | |
435 msa->cutoff[MSA_CUTOFF_NC2] = atof(text); | |
436 msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE; | |
437 } | |
438 } | |
439 else if (strcmp(featurename, "TC") == 0) | |
440 { | |
441 s = text; | |
442 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
443 msa->cutoff[MSA_CUTOFF_TC1] = atof(text); | |
444 msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE; | |
445 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) { | |
446 msa->cutoff[MSA_CUTOFF_TC2] = atof(text); | |
447 msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE; | |
448 } | |
449 } | |
450 else | |
451 MSAAddGF(msa, featurename, text); | |
452 | |
453 return 1; | |
454 } | |
455 | |
456 | |
457 /* Format of a GS line: | |
458 * #=GS <seqname> <featurename> <text> | |
459 */ | |
460 static int | |
461 parse_gs(MSA *msa, char *buf) | |
462 { | |
463 char *gs; | |
464 char *seqname; | |
465 char *featurename; | |
466 char *text; | |
467 int seqidx; | |
468 char *s; | |
469 | |
470 s = buf; | |
471 if ((gs = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
472 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
473 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
474 if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0; | |
475 while (*text && (*text == ' ' || *text == '\t')) text++; | |
476 | |
477 /* GS usually follows another GS; guess lastidx+1 | |
478 */ | |
479 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1); | |
480 msa->lastidx = seqidx; | |
481 | |
482 if (strcmp(featurename, "WT") == 0) | |
483 { | |
484 msa->wgt[seqidx] = atof(text); | |
485 msa->flags |= MSA_SET_WGT; | |
486 } | |
487 | |
488 else if (strcmp(featurename, "AC") == 0) | |
489 MSASetSeqAccession(msa, seqidx, text); | |
490 | |
491 else if (strcmp(featurename, "DE") == 0) | |
492 MSASetSeqDescription(msa, seqidx, text); | |
493 | |
494 else | |
495 MSAAddGS(msa, featurename, seqidx, text); | |
496 | |
497 return 1; | |
498 } | |
499 | |
500 /* Format of a GC line: | |
501 * #=GC <featurename> <text> | |
502 */ | |
503 static int | |
504 parse_gc(MSA *msa, char *buf) | |
505 { | |
506 char *gc; | |
507 char *featurename; | |
508 char *text; | |
509 char *s; | |
510 int len; | |
511 | |
512 s = buf; | |
513 if ((gc = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
514 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
515 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0; | |
516 | |
517 if (strcmp(featurename, "SS_cons") == 0) | |
518 sre_strcat(&(msa->ss_cons), -1, text, len); | |
519 else if (strcmp(featurename, "SA_cons") == 0) | |
520 sre_strcat(&(msa->sa_cons), -1, text, len); | |
521 else if (strcmp(featurename, "RF") == 0) | |
522 sre_strcat(&(msa->rf), -1, text, len); | |
523 else | |
524 MSAAppendGC(msa, featurename, text); | |
525 | |
526 return 1; | |
527 } | |
528 | |
529 /* Format of a GR line: | |
530 * #=GR <seqname> <featurename> <text> | |
531 */ | |
532 static int | |
533 parse_gr(MSA *msa, char *buf) | |
534 { | |
535 char *gr; | |
536 char *seqname; | |
537 char *featurename; | |
538 char *text; | |
539 int seqidx; | |
540 int len; | |
541 int j; | |
542 char *s; | |
543 | |
544 s = buf; | |
545 if ((gr = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
546 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
547 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
548 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0; | |
549 | |
550 /* GR usually follows sequence it refers to; guess msa->lastidx */ | |
551 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx); | |
552 msa->lastidx = seqidx; | |
553 | |
554 if (strcmp(featurename, "SS") == 0) | |
555 { | |
556 if (msa->ss == NULL) | |
557 { | |
558 msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc); | |
559 msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc); | |
560 for (j = 0; j < msa->nseqalloc; j++) | |
561 { | |
562 msa->ss[j] = NULL; | |
563 msa->sslen[j] = 0; | |
564 } | |
565 } | |
566 msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len); | |
567 } | |
568 else if (strcmp(featurename, "SA") == 0) | |
569 { | |
570 if (msa->sa == NULL) | |
571 { | |
572 msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc); | |
573 msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc); | |
574 for (j = 0; j < msa->nseqalloc; j++) | |
575 { | |
576 msa->sa[j] = NULL; | |
577 msa->salen[j] = 0; | |
578 } | |
579 } | |
580 msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len); | |
581 } | |
582 else | |
583 MSAAppendGR(msa, featurename, seqidx, text); | |
584 | |
585 return 1; | |
586 } | |
587 | |
588 | |
589 /* comments are simply stored verbatim, not parsed | |
590 */ | |
591 static int | |
592 parse_comment(MSA *msa, char *buf) | |
593 { | |
594 char *s; | |
595 char *comment; | |
596 | |
597 s = buf + 1; /* skip leading '#' */ | |
598 if (*s == '\n') { *s = '\0'; comment = s; } /* deal with blank comment */ | |
599 else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0; | |
600 | |
601 MSAAddComment(msa, comment); | |
602 return 1; | |
603 } | |
604 | |
605 static int | |
606 parse_sequence(MSA *msa, char *buf) | |
607 { | |
608 char *s; | |
609 char *seqname; | |
610 char *text; | |
611 int seqidx; | |
612 int len; | |
613 | |
614 s = buf; | |
615 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; | |
616 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0; | |
617 | |
618 /* seq usually follows another seq; guess msa->lastidx +1 */ | |
619 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1); | |
620 msa->lastidx = seqidx; | |
621 | |
622 msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len); | |
623 return 1; | |
624 } | |
625 | |
626 | |
627 |