Mercurial > repos > calkan > mrsfast
comparison mrsfast-2.3.0.2/MrsFAST.c @ 0:ec628ba33878 default tip
Uploaded source code for mrsFAST
author | calkan |
---|---|
date | Tue, 21 Feb 2012 10:39:28 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ec628ba33878 |
---|---|
1 /* | |
2 * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University | |
3 * All rights reserved. | |
4 * | |
5 * Redistribution and use in source and binary forms, with or without modification, | |
6 * are permitted provided that the following conditions are met: | |
7 * | |
8 * Redistributions of source code must retain the above copyright notice, this list | |
9 * of conditions and the following disclaimer. | |
10 * - Redistributions in binary form must reproduce the above copyright notice, this | |
11 * list of conditions and the following disclaimer in the documentation and/or other | |
12 * materials provided with the distribution. | |
13 * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be | |
14 * used to endorse or promote products derived from this software without specific | |
15 * prior written permission. | |
16 * | |
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
24 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
25 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 */ | |
29 | |
30 /* | |
31 * Author : Faraz Hach | |
32 * Email : fhach AT cs DOT sfu | |
33 * Last Update : 2009-01-29 | |
34 */ | |
35 | |
36 #include <stdio.h> | |
37 #include <stdlib.h> | |
38 #include <string.h> | |
39 #include <math.h> | |
40 #include "Common.h" | |
41 #include "Reads.h" | |
42 #include "HashTable.h" | |
43 #include "Output.h" | |
44 #include "MrsFAST.h" | |
45 #include "RefGenome.h" | |
46 | |
47 float calculateScore(int index, char *seq, char *qual, int *err); | |
48 unsigned char mrFAST = 0; | |
49 char *versionNumberF="0.2"; | |
50 | |
51 long long verificationCnt = 0; | |
52 long long mappingCnt = 0; | |
53 long long mappedSeqCnt = 0; | |
54 long long completedSeqCnt = 0; | |
55 char *mappingOutput; | |
56 /**********************************************/ | |
57 char *_msf_refGen = NULL; | |
58 int _msf_refGenLength = 0; | |
59 int _msf_refGenOffset = 0; | |
60 char *_msf_refGenName = NULL; | |
61 | |
62 int _msf_refGenBeg; | |
63 int _msf_refGenEnd; | |
64 | |
65 IHashTable *_msf_hashTable = NULL; | |
66 | |
67 int *_msf_samplingLocs; | |
68 int *_msf_samplingLocsEnds; | |
69 int _msf_samplingLocsSize; | |
70 | |
71 Read *_msf_seqList; | |
72 int _msf_seqListSize; | |
73 | |
74 ReadIndexTable *_msf_rIndex = NULL; | |
75 int _msf_rIndexSize; | |
76 int _msf_rIndexMax; | |
77 | |
78 SAM _msf_output; | |
79 | |
80 OPT_FIELDS *_msf_optionalFields; | |
81 | |
82 char *_msf_op; | |
83 | |
84 char _msf_numbers[200][3]; | |
85 char _msf_cigar[5]; | |
86 | |
87 MappingInfo *_msf_mappingInfo; | |
88 | |
89 int *_msf_seqHits; | |
90 int _msf_openFiles = 0; | |
91 int _msf_maxLSize=0; | |
92 int _msf_maxRSize=0; | |
93 /**********************************************/ | |
94 int compare (const void *a, const void *b) | |
95 { | |
96 return ((Pair *)a)->hv - ((Pair *)b)->hv; | |
97 } | |
98 /**********************************************/ | |
99 void preProcessReads() | |
100 { | |
101 int i=0; | |
102 int j=0; | |
103 int pos = 0; | |
104 | |
105 _msf_rIndexMax = -1; | |
106 | |
107 int tmpSize = _msf_seqListSize*_msf_samplingLocsSize*2; | |
108 Pair *tmp = getMem(sizeof(Pair)*tmpSize); | |
109 | |
110 for (i=0; i<_msf_seqListSize; i++) | |
111 { | |
112 for (j=0; j< _msf_samplingLocsSize; j++) | |
113 { | |
114 | |
115 tmp[pos].hv = hashVal(_msf_seqList[i].seq+_msf_samplingLocs[j]); | |
116 tmp[pos].seqInfo = pos; | |
117 pos++; | |
118 } | |
119 for (j=0; j<_msf_samplingLocsSize; j++) | |
120 { | |
121 tmp[pos].hv = hashVal(_msf_seqList[i].rseq+_msf_samplingLocs[j]); | |
122 tmp[pos].seqInfo = pos; | |
123 pos++; | |
124 } | |
125 } | |
126 | |
127 qsort(tmp, tmpSize, sizeof(Pair), compare); | |
128 | |
129 | |
130 int uniq = 0; | |
131 int prev = -2; | |
132 int beg = -1; | |
133 int end = -1; | |
134 | |
135 for (i=0; i<tmpSize; i++) | |
136 { | |
137 if (prev != tmp[i].hv) | |
138 { | |
139 uniq ++; | |
140 prev = tmp[i].hv; | |
141 } | |
142 } | |
143 | |
144 _msf_rIndexSize = uniq; | |
145 _msf_rIndex = getMem(sizeof(ReadIndexTable)*_msf_rIndexSize); | |
146 prev = -2; | |
147 | |
148 j=0; | |
149 beg =0; | |
150 while (beg < tmpSize) | |
151 { | |
152 end = beg; | |
153 while (end+1<tmpSize && tmp[end+1].hv==tmp[beg].hv) | |
154 end++; | |
155 | |
156 _msf_rIndex[j].hv = tmp[beg].hv; | |
157 _msf_rIndex[j].seqInfo = getMem(sizeof(int)*(end-beg+2)); | |
158 _msf_rIndex[j].seqInfo[0] = end-beg+1; | |
159 if ((end-beg+1) > _msf_rIndexMax) | |
160 _msf_rIndexMax = end-beg+1; | |
161 | |
162 for (i=1; i<=_msf_rIndex[j].seqInfo[0]; i++) | |
163 { | |
164 _msf_rIndex[j].seqInfo[i]=tmp[beg+i-1].seqInfo; | |
165 } | |
166 j++; | |
167 beg = end+1; | |
168 } | |
169 freeMem(tmp, sizeof(Pair)*tmpSize); | |
170 } | |
171 /**********************************************/ | |
172 void initFAST(Read *seqList, int seqListSize, int *samplingLocs, int samplingLocsSize, char *genFileName) | |
173 { | |
174 if (_msf_optionalFields == NULL) | |
175 { | |
176 _msf_op = getMem(SEQ_LENGTH); | |
177 if (pairedEndMode) | |
178 { | |
179 _msf_optionalFields = getMem(4*sizeof(OPT_FIELDS)); | |
180 } | |
181 else | |
182 { | |
183 _msf_optionalFields = getMem(2*sizeof(OPT_FIELDS)); | |
184 } | |
185 | |
186 int i; | |
187 for (i=0; i<200;i++) | |
188 { | |
189 sprintf(_msf_numbers[i],"%d%c",i, '\0'); | |
190 } | |
191 sprintf(_msf_cigar, "%dM", SEQ_LENGTH); | |
192 } | |
193 | |
194 if (_msf_samplingLocsEnds == NULL) | |
195 { | |
196 int i; | |
197 _msf_samplingLocs = samplingLocs; | |
198 _msf_samplingLocsSize = samplingLocsSize; | |
199 | |
200 _msf_samplingLocsEnds = malloc(sizeof(int)*_msf_samplingLocsSize); | |
201 for (i=0; i<_msf_samplingLocsSize; i++) | |
202 { | |
203 _msf_samplingLocsEnds[i]=_msf_samplingLocs[i]+WINDOW_SIZE-1; | |
204 } | |
205 | |
206 _msf_seqList = seqList; | |
207 _msf_seqListSize = seqListSize; | |
208 | |
209 preProcessReads(); | |
210 } | |
211 if (_msf_refGenName == NULL) | |
212 { | |
213 _msf_refGenName = getMem(SEQ_LENGTH); | |
214 } | |
215 _msf_refGen = getRefGenome(); | |
216 _msf_refGenLength = strlen(_msf_refGen); | |
217 _msf_refGenOffset = getRefGenomeOffset(); | |
218 sprintf(_msf_refGenName,"%s%c", getRefGenomeName(), '\0'); | |
219 | |
220 if (pairedEndMode && _msf_seqHits == NULL) | |
221 { | |
222 _msf_mappingInfo = getMem(seqListSize * sizeof (MappingInfo)); | |
223 | |
224 int i=0; | |
225 for (i=0; i<seqListSize; i++) | |
226 { | |
227 //_msf_mappingInfo[i].next = getMem(sizeof(MappingLocations)); | |
228 _msf_mappingInfo[i].next = NULL; | |
229 _msf_mappingInfo[i].size = 0; | |
230 } | |
231 | |
232 _msf_seqHits = getMem((_msf_seqListSize/2) * sizeof(int)); | |
233 | |
234 | |
235 for (i=0; i<_msf_seqListSize/2; i++) | |
236 { | |
237 _msf_seqHits[i] = 0; | |
238 } | |
239 | |
240 initLoadingRefGenome(genFileName); | |
241 } | |
242 | |
243 if (_msf_refGenOffset == 0) | |
244 { | |
245 _msf_refGenBeg = 1; | |
246 } | |
247 else | |
248 { | |
249 _msf_refGenBeg = CONTIG_OVERLAP - SEQ_LENGTH + 2; | |
250 } | |
251 _msf_refGenEnd = _msf_refGenLength - SEQ_LENGTH + 1; | |
252 | |
253 | |
254 } | |
255 /**********************************************/ | |
256 void finalizeFAST() | |
257 { | |
258 freeMem(_msf_seqHits, (_msf_seqListSize/2) * sizeof(int)); | |
259 freeMem(_msf_refGenName, SEQ_LENGTH); | |
260 int i; | |
261 for (i=0; i<_msf_rIndexSize; i++) | |
262 { | |
263 freeMem(_msf_rIndex[i].seqInfo, _msf_rIndex[i].seqInfo[0]+1); | |
264 } | |
265 freeMem(_msf_rIndex, _msf_rIndexSize); | |
266 } | |
267 | |
268 /**********************************************/ | |
269 int verifySingleEnd(int index, char* seq, int offset) | |
270 { | |
271 int curOff = 0; | |
272 int i; | |
273 | |
274 char *ref; | |
275 | |
276 int err; | |
277 int errCnt =0; | |
278 int errCntOff = 0; | |
279 int NCntOff = 0; | |
280 | |
281 int matchCnt = 0; | |
282 int pp = 0; | |
283 | |
284 ref = _msf_refGen + index - 1; | |
285 | |
286 verificationCnt++; | |
287 | |
288 for (i = 0; i < SEQ_LENGTH; i++) | |
289 { | |
290 err = *ref != *seq; | |
291 | |
292 errCnt += err; | |
293 if (errCnt > errThreshold) | |
294 { | |
295 | |
296 return -1; | |
297 } | |
298 | |
299 if (i >= _msf_samplingLocs[curOff] && i <= _msf_samplingLocsEnds[curOff]) | |
300 { | |
301 errCntOff += err; | |
302 NCntOff += (*seq == 'N'); | |
303 } | |
304 else if (curOff < _msf_samplingLocsSize && i>=_msf_samplingLocs[curOff+1]) | |
305 { | |
306 | |
307 if (errCntOff == 0 && NCntOff == 0 && offset > curOff) | |
308 { | |
309 return -1; | |
310 } | |
311 | |
312 errCntOff = 0; | |
313 NCntOff = 0; | |
314 curOff++; | |
315 | |
316 if ( i >= _msf_samplingLocs[curOff]) | |
317 { | |
318 errCntOff += err; | |
319 NCntOff += (*seq == 'N'); | |
320 } | |
321 } | |
322 | |
323 ref++; | |
324 seq++; | |
325 } | |
326 return errCnt; | |
327 } | |
328 | |
329 /**********************************************/ | |
330 int calculateMD(int index, char *seq, int err, char **opSeq) | |
331 { | |
332 int i; | |
333 char *ref; | |
334 char *ver; | |
335 short matchCnt = 0; | |
336 char *op = *opSeq; | |
337 int pp = 0; | |
338 | |
339 ref = _msf_refGen + index-1; | |
340 ver = seq; | |
341 | |
342 if (err>0 || err == -1 ) | |
343 { | |
344 | |
345 err = 0; | |
346 for (i=0; i < SEQ_LENGTH; i++) | |
347 { | |
348 if (* ref != *ver) | |
349 { | |
350 err++; | |
351 if (matchCnt) | |
352 { | |
353 if (matchCnt < 10) | |
354 { | |
355 op[pp++]=_msf_numbers[matchCnt][0]; | |
356 } | |
357 else if (matchCnt < 100) | |
358 { | |
359 op[pp++]=_msf_numbers[matchCnt][0]; | |
360 op[pp++]=_msf_numbers[matchCnt][1]; | |
361 } | |
362 else | |
363 { | |
364 op[pp++]=_msf_numbers[matchCnt][0]; | |
365 op[pp++]=_msf_numbers[matchCnt][1]; | |
366 op[pp++]=_msf_numbers[matchCnt][2]; | |
367 } | |
368 | |
369 matchCnt = 0; | |
370 } | |
371 op[pp++]=*ref; | |
372 } | |
373 else | |
374 { | |
375 matchCnt++; | |
376 } | |
377 ref++; | |
378 ver++; | |
379 } | |
380 | |
381 } | |
382 if (err == 0) | |
383 { | |
384 matchCnt = SEQ_LENGTH; | |
385 } | |
386 | |
387 if (matchCnt>0) | |
388 { | |
389 if (matchCnt < 10) | |
390 { | |
391 op[pp++]=_msf_numbers[matchCnt][0]; | |
392 } | |
393 else if (matchCnt < 100) | |
394 { | |
395 op[pp++]=_msf_numbers[matchCnt][0]; | |
396 op[pp++]=_msf_numbers[matchCnt][1]; | |
397 } | |
398 else | |
399 { | |
400 op[pp++]=_msf_numbers[matchCnt][0]; | |
401 op[pp++]=_msf_numbers[matchCnt][1]; | |
402 op[pp++]=_msf_numbers[matchCnt][2]; | |
403 } | |
404 } | |
405 op[pp]='\0'; | |
406 | |
407 return err; | |
408 } | |
409 | |
410 /**********************************************/ | |
411 void mapSingleEndSeqListBal(unsigned int *l1, int s1, unsigned int *l2, int s2, int dir) | |
412 { | |
413 | |
414 if (s1 == 0 || s2 == 0) | |
415 { | |
416 return; | |
417 } | |
418 else if (s1 == s2 && s1 <= 50) | |
419 { | |
420 | |
421 int j = 0; | |
422 int z = 0; | |
423 int *locs; | |
424 int *seqInfo; | |
425 char *_tmpSeq, *_tmpQual; | |
426 char rqual[QUAL_LENGTH+1]; | |
427 rqual[QUAL_LENGTH]='\0'; | |
428 | |
429 if (dir > 0) | |
430 { | |
431 locs = (int *) l1; | |
432 seqInfo = (int *) l2; | |
433 } | |
434 else | |
435 { | |
436 locs = (int *) l2; | |
437 seqInfo = (int *) l1; | |
438 } | |
439 | |
440 | |
441 for (j=0; j<s2; j++) | |
442 { | |
443 int re = _msf_samplingLocsSize * 2; | |
444 int r = seqInfo[j]/re; | |
445 if (maxHits!=0 && _msf_seqList[r].hits[0] == maxHits) | |
446 { | |
447 continue; | |
448 } | |
449 | |
450 int x = seqInfo[j] % re; | |
451 int o = x % _msf_samplingLocsSize; | |
452 char d = (x/_msf_samplingLocsSize)?1:0; | |
453 | |
454 | |
455 if (d) | |
456 { | |
457 reverse(_msf_seqList[r].qual, rqual, QUAL_LENGTH); | |
458 _tmpQual = rqual; | |
459 _tmpSeq = _msf_seqList[r].rseq; | |
460 } | |
461 else | |
462 { | |
463 _tmpQual = _msf_seqList[r].qual; | |
464 _tmpSeq = _msf_seqList[r].seq; | |
465 } | |
466 | |
467 | |
468 for (z=0; z<s1; z++) | |
469 { | |
470 int genLoc = locs[z]-_msf_samplingLocs[o]; | |
471 | |
472 | |
473 if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd ) | |
474 continue; | |
475 | |
476 int err = -1; | |
477 | |
478 | |
479 | |
480 err = verifySingleEnd(genLoc, _tmpSeq, o); | |
481 | |
482 | |
483 | |
484 if (err != -1) | |
485 { | |
486 calculateMD(genLoc, _tmpSeq, err, &_msf_op); | |
487 mappingCnt++; | |
488 _msf_seqList[r].hits[0]++; | |
489 | |
490 _msf_output.QNAME = _msf_seqList[r].name; | |
491 _msf_output.FLAG = 16 * d; | |
492 _msf_output.RNAME = _msf_refGenName; | |
493 _msf_output.POS = genLoc + _msf_refGenOffset; | |
494 _msf_output.MAPQ = 255; | |
495 _msf_output.CIGAR = _msf_cigar; | |
496 _msf_output.MRNAME = "*"; | |
497 _msf_output.MPOS = 0; | |
498 _msf_output.ISIZE = 0; | |
499 _msf_output.SEQ = _tmpSeq; | |
500 _msf_output.QUAL = _tmpQual; | |
501 | |
502 _msf_output.optSize = 2; | |
503 _msf_output.optFields = _msf_optionalFields; | |
504 | |
505 _msf_optionalFields[0].tag = "NM"; | |
506 _msf_optionalFields[0].type = 'i'; | |
507 _msf_optionalFields[0].iVal = err; | |
508 | |
509 _msf_optionalFields[1].tag = "MD"; | |
510 _msf_optionalFields[1].type = 'Z'; | |
511 _msf_optionalFields[1].sVal = _msf_op; | |
512 | |
513 output(_msf_output); | |
514 | |
515 | |
516 if (_msf_seqList[r].hits[0] == 1) | |
517 { | |
518 mappedSeqCnt++; | |
519 } | |
520 | |
521 if ( maxHits == 0 ) | |
522 { | |
523 _msf_seqList[r].hits[0] = 2; | |
524 } | |
525 | |
526 | |
527 if ( maxHits!=0 && _msf_seqList[r].hits[0] == maxHits) | |
528 { | |
529 completedSeqCnt++; | |
530 break; | |
531 } | |
532 } | |
533 | |
534 } | |
535 } | |
536 } | |
537 else | |
538 { | |
539 int tmp1=s1/2, tmp2= s2/2; | |
540 if (tmp1 != 0) | |
541 mapSingleEndSeqListBal(l1, tmp1, l2+tmp2, s2-tmp2, dir); | |
542 mapSingleEndSeqListBal(l2+tmp2, s2-tmp2, l1+tmp1, s1-tmp1, -dir); | |
543 if (tmp2 !=0) | |
544 mapSingleEndSeqListBal(l1+tmp1, s1-tmp1, l2, tmp2, dir); | |
545 if (tmp1 + tmp2 != 0) | |
546 mapSingleEndSeqListBal(l2, tmp2, l1, tmp1, -dir); | |
547 } | |
548 } | |
549 | |
550 | |
551 /**********************************************/ | |
552 void mapSingleEndSeqListTOP(unsigned int *l1, int s1, unsigned int *l2, int s2) | |
553 { | |
554 if (s1 < s2) | |
555 { | |
556 mapSingleEndSeqListBal(l1, s1, l2, s1,1); | |
557 mapSingleEndSeqListTOP(l1, s1, l2+s1, s2-s1); | |
558 } | |
559 else if (s1 > s2) | |
560 { | |
561 mapSingleEndSeqListBal(l1, s2, l2, s2,1); | |
562 mapSingleEndSeqListTOP(l1+s2, s1-s2, l2, s2); | |
563 } | |
564 else | |
565 { | |
566 mapSingleEndSeqListBal(l1, s1, l2, s2,1); | |
567 } | |
568 } | |
569 | |
570 | |
571 /**********************************************/ | |
572 void mapSingleEndSeqList(unsigned int *l1, int s1, unsigned int *l2, int s2) | |
573 { | |
574 if ( s2/s1 <= 2) | |
575 { | |
576 int j = 0; | |
577 int z = 0; | |
578 int *locs = (int *) l1; | |
579 int *seqInfo = (int *) l2; | |
580 char *_tmpSeq, *_tmpQual; | |
581 char rqual[QUAL_LENGTH+1]; | |
582 rqual[QUAL_LENGTH]='\0'; | |
583 | |
584 for (j=0; j<s2; j++) | |
585 { | |
586 int re = _msf_samplingLocsSize * 2; | |
587 int r = seqInfo[j]/re; | |
588 if (maxHits!=0 && _msf_seqList[r].hits[0] == maxHits) | |
589 { | |
590 continue; | |
591 } | |
592 | |
593 int x = seqInfo[j] % re; | |
594 int o = x % _msf_samplingLocsSize; | |
595 char d = (x/_msf_samplingLocsSize)?1:0; | |
596 | |
597 | |
598 if (d) | |
599 { | |
600 reverse(_msf_seqList[r].qual, rqual, QUAL_LENGTH); | |
601 _tmpQual = rqual; | |
602 _tmpSeq = _msf_seqList[r].rseq; | |
603 } | |
604 else | |
605 { | |
606 _tmpQual = _msf_seqList[r].qual; | |
607 _tmpSeq = _msf_seqList[r].seq; | |
608 } | |
609 | |
610 | |
611 for (z=0; z<s1; z++) | |
612 { | |
613 int genLoc = locs[z]-_msf_samplingLocs[o]; | |
614 | |
615 | |
616 if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd ) | |
617 continue; | |
618 | |
619 int err = -1; | |
620 | |
621 | |
622 | |
623 err = verifySingleEnd(genLoc, _tmpSeq, o); | |
624 | |
625 | |
626 | |
627 if (err != -1) | |
628 { | |
629 calculateMD(genLoc, _tmpSeq, err, &_msf_op); | |
630 mappingCnt++; | |
631 _msf_seqList[r].hits[0]++; | |
632 | |
633 _msf_output.QNAME = _msf_seqList[r].name; | |
634 _msf_output.FLAG = 16 * d; | |
635 _msf_output.RNAME = _msf_refGenName; | |
636 _msf_output.POS = genLoc + _msf_refGenOffset; | |
637 _msf_output.MAPQ = 255; | |
638 _msf_output.CIGAR = _msf_cigar; | |
639 _msf_output.MRNAME = "*"; | |
640 _msf_output.MPOS = 0; | |
641 _msf_output.ISIZE = 0; | |
642 _msf_output.SEQ = _tmpSeq; | |
643 _msf_output.QUAL = _tmpQual; | |
644 | |
645 _msf_output.optSize = 2; | |
646 _msf_output.optFields = _msf_optionalFields; | |
647 | |
648 _msf_optionalFields[0].tag = "NM"; | |
649 _msf_optionalFields[0].type = 'i'; | |
650 _msf_optionalFields[0].iVal = err; | |
651 | |
652 _msf_optionalFields[1].tag = "MD"; | |
653 _msf_optionalFields[1].type = 'Z'; | |
654 _msf_optionalFields[1].sVal = _msf_op; | |
655 | |
656 output(_msf_output); | |
657 | |
658 | |
659 if (_msf_seqList[r].hits[0] == 1) | |
660 { | |
661 mappedSeqCnt++; | |
662 } | |
663 | |
664 if ( maxHits == 0 ) | |
665 { | |
666 _msf_seqList[r].hits[0] = 2; | |
667 } | |
668 | |
669 | |
670 if ( maxHits!=0 && _msf_seqList[r].hits[0] == maxHits) | |
671 { | |
672 completedSeqCnt++; | |
673 break; | |
674 } | |
675 } | |
676 | |
677 } | |
678 } | |
679 } | |
680 else if (s1 == 1) | |
681 { | |
682 // fprintf(stderr, "1"); | |
683 int tmp = s2/2; | |
684 mapSingleEndSeqList(l1, s1, l2, tmp); | |
685 mapSingleEndSeqList(l1, s1, l2+tmp, s2-tmp); | |
686 } | |
687 else if (s2 == 1) | |
688 { | |
689 // fprintf(stderr, "2"); | |
690 int tmp = s1/2; | |
691 mapSingleEndSeqList(l1, tmp, l2, s2); | |
692 mapSingleEndSeqList(l1+tmp, s1-tmp, l2, s2); | |
693 } | |
694 else | |
695 { | |
696 // fprintf(stderr, "3"); | |
697 int tmp1=s1/2, tmp2= s2/2; | |
698 mapSingleEndSeqList(l1, tmp1, l2, tmp2); | |
699 mapSingleEndSeqList(l1+tmp1, s1-tmp1, l2, tmp2); | |
700 mapSingleEndSeqList(l1+tmp1, s1-tmp1, l2+tmp2, s2-tmp2); | |
701 mapSingleEndSeqList(l1, tmp1, l2+tmp2, s2-tmp2); | |
702 } | |
703 } | |
704 /**********************************************/ | |
705 int mapSingleEndSeq() | |
706 { | |
707 int i = 0; | |
708 unsigned int *locs = NULL; | |
709 unsigned int *seqInfo = NULL; | |
710 while ( i < _msf_rIndexSize ) | |
711 { | |
712 | |
713 locs = getCandidates (_msf_rIndex[i].hv); | |
714 if ( locs != NULL) | |
715 { | |
716 seqInfo = _msf_rIndex[i].seqInfo; | |
717 mapSingleEndSeqListTOP (locs+1, locs[0], seqInfo+1, seqInfo[0]); | |
718 } | |
719 i++; | |
720 } | |
721 return 1; | |
722 } | |
723 | |
724 | |
725 /**********************************************/ | |
726 /**********************************************/ | |
727 /**********************************************/ | |
728 /**********************************************/ | |
729 /**********************************************/ | |
730 int compareOut (const void *a, const void *b) | |
731 { | |
732 FullMappingInfo *aInfo = (FullMappingInfo *)a; | |
733 FullMappingInfo *bInfo = (FullMappingInfo *)b; | |
734 return aInfo->loc - bInfo->loc; | |
735 } | |
736 | |
737 /**********************************************/ | |
738 void mapPairedEndSeqList(unsigned int *l1, int s1, unsigned int *l2, int s2) | |
739 { | |
740 if ( s2/s1 <= 2) | |
741 { | |
742 int j = 0; | |
743 int z = 0; | |
744 int *locs = (int *) l1; | |
745 int *seqInfo = (int *) l2; | |
746 char *_tmpSeq, *_tmpQual; | |
747 char rqual[QUAL_LENGTH+1]; | |
748 rqual[QUAL_LENGTH]='\0'; | |
749 | |
750 for (j=0; j<s2; j++) | |
751 { | |
752 int re = _msf_samplingLocsSize * 2; | |
753 int r = seqInfo[j]/re; | |
754 | |
755 if (pairedEndDiscordantMode && (_msf_seqList[r].hits[0] == 1 || (_msf_seqHits[r/2] > DISCORDANT_CUT_OFF) )) | |
756 { | |
757 continue; | |
758 } | |
759 | |
760 int x = seqInfo[j] % re; | |
761 int o = x % _msf_samplingLocsSize; | |
762 char d = (x/_msf_samplingLocsSize)?-1:1; | |
763 | |
764 | |
765 if (d==-1) | |
766 { | |
767 _tmpSeq = _msf_seqList[r].rseq; | |
768 } | |
769 else | |
770 { | |
771 _tmpSeq = _msf_seqList[r].seq; | |
772 } | |
773 | |
774 | |
775 for (z=0; z<s1; z++) | |
776 { | |
777 int genLoc = locs[z]-_msf_samplingLocs[o]; | |
778 | |
779 | |
780 if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd ) | |
781 continue; | |
782 | |
783 int err = -1; | |
784 | |
785 | |
786 | |
787 err = verifySingleEnd(genLoc, _tmpSeq, o); | |
788 | |
789 | |
790 if (err != -1) | |
791 { | |
792 MappingLocations *parent = NULL; | |
793 MappingLocations *child = _msf_mappingInfo[r].next; | |
794 | |
795 genLoc+= _msf_refGenOffset; | |
796 | |
797 int i = 0; | |
798 for (i=0; i<(_msf_mappingInfo[r].size/MAP_CHUNKS); i++) | |
799 { | |
800 parent = child; | |
801 child = child->next; | |
802 } | |
803 | |
804 if (child==NULL) | |
805 { | |
806 MappingLocations *tmp = getMem(sizeof(MappingLocations)); | |
807 tmp->next = NULL; | |
808 tmp->loc[0]=genLoc * d; | |
809 if (parent == NULL) | |
810 _msf_mappingInfo[r].next = tmp; | |
811 else | |
812 parent->next = tmp; | |
813 } | |
814 else | |
815 { | |
816 child->loc[_msf_mappingInfo[r].size % MAP_CHUNKS] = genLoc * d; | |
817 } | |
818 | |
819 | |
820 | |
821 _msf_mappingInfo[r].size++; | |
822 | |
823 } | |
824 | |
825 } | |
826 } | |
827 } | |
828 else if (s1 == 1) | |
829 { | |
830 int tmp = s2/2; | |
831 mapPairedEndSeqList(l1, s1, l2, tmp); | |
832 mapPairedEndSeqList(l1, s1, l2+tmp, s2-tmp); | |
833 } | |
834 else if (s2 == 1) | |
835 { | |
836 int tmp = s1/2; | |
837 mapPairedEndSeqList(l1, tmp, l2, s2); | |
838 mapPairedEndSeqList(l1+tmp, s1-tmp, l2, s2); | |
839 } | |
840 else | |
841 { | |
842 int tmp1=s1/2, tmp2= s2/2; | |
843 mapPairedEndSeqList(l1, tmp1, l2, tmp2); | |
844 mapPairedEndSeqList(l1+tmp1, s1-tmp1, l2, tmp2); | |
845 mapPairedEndSeqList(l1+tmp1, s1-tmp1, l2+tmp2, s2-tmp2); | |
846 mapPairedEndSeqList(l1, tmp1, l2+tmp2, s2-tmp2); | |
847 } | |
848 } | |
849 | |
850 /**********************************************/ | |
851 int mapPairedEndSeq() | |
852 { | |
853 int i = 0; | |
854 unsigned int *locs = NULL; | |
855 unsigned int *seqInfo = NULL; | |
856 while ( i < _msf_rIndexSize ) | |
857 { | |
858 locs = getCandidates (_msf_rIndex[i].hv); | |
859 if ( locs != NULL) | |
860 { | |
861 seqInfo = _msf_rIndex[i].seqInfo; | |
862 mapPairedEndSeqList(locs+1, locs[0], seqInfo+1, seqInfo[0]); | |
863 | |
864 } | |
865 i++; | |
866 } | |
867 | |
868 | |
869 char fname1[FILE_NAME_LENGTH]; | |
870 char fname2[FILE_NAME_LENGTH]; | |
871 MappingLocations *cur, *tmp; | |
872 int tmpOut; | |
873 int j; | |
874 int lmax=0, rmax=0; | |
875 | |
876 sprintf(fname1, "%s__%s__%d__1",mappingOutputPath, mappingOutput, _msf_openFiles); | |
877 sprintf(fname2, "%s__%s__%d__2",mappingOutputPath, mappingOutput, _msf_openFiles); | |
878 | |
879 FILE* out; | |
880 FILE* out1 = fileOpen(fname1, "w"); | |
881 FILE* out2 = fileOpen(fname2, "w"); | |
882 | |
883 _msf_openFiles++; | |
884 | |
885 for (i=0; i<_msf_seqListSize; i++) | |
886 { | |
887 | |
888 if (i%2==0) | |
889 { | |
890 out = out1; | |
891 | |
892 if (lmax < _msf_mappingInfo[i].size) | |
893 { | |
894 lmax = _msf_mappingInfo[i].size; | |
895 } | |
896 } | |
897 else | |
898 { | |
899 out = out2; | |
900 if (rmax < _msf_mappingInfo[i].size) | |
901 { | |
902 rmax = _msf_mappingInfo[i].size; | |
903 } | |
904 } | |
905 | |
906 tmpOut = fwrite(&(_msf_mappingInfo[i].size), sizeof(int), 1, out); | |
907 if (_msf_mappingInfo[i].size > 0) | |
908 { | |
909 cur = _msf_mappingInfo[i].next; | |
910 for (j=0; j < _msf_mappingInfo[i].size; j++) | |
911 { | |
912 if ( j>0 && j%MAP_CHUNKS==0) | |
913 { | |
914 cur = cur->next; | |
915 } | |
916 tmpOut = fwrite(&(cur->loc[j % MAP_CHUNKS]), sizeof(int), 1, out); | |
917 } | |
918 _msf_mappingInfo[i].size = 0; | |
919 // _msf_mappingInfo[i].next = NULL; | |
920 } | |
921 } | |
922 | |
923 _msf_maxLSize += lmax; | |
924 _msf_maxRSize += rmax; | |
925 | |
926 fclose(out1); | |
927 fclose(out2); | |
928 | |
929 //fprintf(stdout, "%d %d\n", _msf_maxLSize, _msf_maxRSize); | |
930 | |
931 } | |
932 | |
933 /**********************************************/ | |
934 void outputPairedEnd() | |
935 { | |
936 | |
937 char *curGen; | |
938 char *curGenName; | |
939 int tmpOut; | |
940 | |
941 loadRefGenome(&_msf_refGen, &_msf_refGenName, &tmpOut); | |
942 | |
943 FILE* in1[_msf_openFiles]; | |
944 FILE* in2[_msf_openFiles]; | |
945 | |
946 char fname1[_msf_openFiles][FILE_NAME_LENGTH]; | |
947 char fname2[_msf_openFiles][FILE_NAME_LENGTH]; | |
948 | |
949 // discordant | |
950 FILE *out, *out1, *out2; | |
951 | |
952 char fname3[FILE_NAME_LENGTH]; | |
953 char fname4[FILE_NAME_LENGTH]; | |
954 char fname5[FILE_NAME_LENGTH]; | |
955 | |
956 if (pairedEndDiscordantMode) | |
957 { | |
958 sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput); | |
959 sprintf(fname4, "%s__%s__oea1", mappingOutputPath, mappingOutput); | |
960 sprintf(fname5, "%s__%s__oea2", mappingOutputPath, mappingOutput); | |
961 out = fileOpen(fname3, "a"); | |
962 out1 = fileOpen(fname4, "a"); | |
963 out2 = fileOpen(fname5, "a"); | |
964 } | |
965 | |
966 | |
967 | |
968 int i; | |
969 | |
970 FullMappingInfo *mi1 = getMem(sizeof(FullMappingInfo) * _msf_maxLSize); | |
971 FullMappingInfo *mi2 = getMem(sizeof(FullMappingInfo) * _msf_maxRSize); | |
972 | |
973 | |
974 for (i=0; i<_msf_openFiles; i++) | |
975 { | |
976 sprintf(fname1[i], "%s__%s__%d__1", mappingOutputPath, mappingOutput, i); | |
977 sprintf(fname2[i], "%s__%s__%d__2", mappingOutputPath, mappingOutput, i); | |
978 in1[i] = fileOpen(fname1[i], "r"); | |
979 in2[i] = fileOpen(fname2[i], "r"); | |
980 } | |
981 | |
982 | |
983 int size; | |
984 int j, k; | |
985 int size1, size2; | |
986 | |
987 for (i=0; i<_msf_seqListSize/2; i++) | |
988 { | |
989 size1 = size2 = 0; | |
990 for (j=0; j<_msf_openFiles; j++) | |
991 { | |
992 tmpOut = fread(&size, sizeof(int), 1, in1[j]); | |
993 if ( size > 0 ) | |
994 { | |
995 for (k=0; k<size; k++) | |
996 { | |
997 | |
998 mi1[size1+k].dir = 1; | |
999 tmpOut = fread (&(mi1[size1+k].loc), sizeof(int), 1, in1[j]); | |
1000 if (mi1[size1+k].loc<1) | |
1001 { | |
1002 mi1[size1+k].loc *= -1; | |
1003 mi1[size1+k].dir = -1; | |
1004 } | |
1005 } | |
1006 qsort(mi1+size1, size, sizeof(FullMappingInfo), compareOut); | |
1007 size1+=size; | |
1008 } | |
1009 } | |
1010 | |
1011 for (j=0; j<_msf_openFiles; j++) | |
1012 { | |
1013 tmpOut = fread(&size, sizeof(int), 1, in2[j]); | |
1014 if ( size > 0 ) | |
1015 { | |
1016 for (k=0; k<size; k++) | |
1017 { | |
1018 | |
1019 mi2[size2+k].dir = 1; | |
1020 tmpOut = fread (&(mi2[size2+k].loc), sizeof(int), 1, in2[j]); | |
1021 | |
1022 if (mi2[size2+k].loc<1) | |
1023 { | |
1024 mi2[size2+k].loc *= -1; | |
1025 mi2[size2+k].dir = -1; | |
1026 } | |
1027 } | |
1028 qsort(mi2+size2, size, sizeof(FullMappingInfo), compareOut); | |
1029 size2+=size; | |
1030 } | |
1031 } | |
1032 | |
1033 //if (i == 6615) | |
1034 // fprintf(stdout, "%d: %s %d %d ",i, _msf_seqList[i*2].name, size1, size2); | |
1035 | |
1036 int lm, ll, rl, rm; | |
1037 int pos = 0; | |
1038 | |
1039 if (pairedEndDiscordantMode) | |
1040 { | |
1041 | |
1042 for (j=0; j<size1; j++) | |
1043 { | |
1044 lm = mi1[j].loc - maxPairEndedDiscordantDistance + 1; | |
1045 ll = mi1[j].loc - minPairEndedDiscordantDistance + 1; | |
1046 rl = mi1[j].loc + minPairEndedDiscordantDistance - 1; | |
1047 rm = mi1[j].loc + maxPairEndedDiscordantDistance - 1; | |
1048 | |
1049 while (pos<size2 && mi2[pos].loc < lm) | |
1050 { | |
1051 pos++; | |
1052 } | |
1053 | |
1054 k = pos; | |
1055 while (k<size2 && mi2[k].loc<=rm) | |
1056 { | |
1057 if ( mi2[k].loc <= ll || mi2[k].loc >= rl) | |
1058 { | |
1059 if ( (mi1[j].loc < mi2[k].loc && mi1[j].dir==1 && mi2[k].dir == -1) || | |
1060 (mi1[j].loc > mi2[k].loc && mi1[j].dir==-1 && mi2[k].dir == 1) ) | |
1061 { | |
1062 _msf_seqList[i*2].hits[0]=1; | |
1063 _msf_seqList[i*2+1].hits[0]=1; | |
1064 size1=0; | |
1065 size2=0; | |
1066 break; | |
1067 } | |
1068 } | |
1069 k++; | |
1070 } | |
1071 } | |
1072 | |
1073 _msf_seqHits[i]+= size1*size2; | |
1074 if (_msf_seqHits[i]> DISCORDANT_CUT_OFF) | |
1075 { | |
1076 _msf_seqList[i*2].hits[0]=1; | |
1077 _msf_seqList[i*2+1].hits[0]=1; | |
1078 size1=0; | |
1079 size2=0; | |
1080 } | |
1081 } | |
1082 //if (i == 6615) | |
1083 // fprintf(stdout, "%d %d\n", size1, size2); | |
1084 | |
1085 char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; | |
1086 char rqual1[QUAL_LENGTH+1], rqual2[QUAL_LENGTH+1]; | |
1087 rqual1[QUAL_LENGTH] = rqual2[QUAL_LENGTH] = '\0'; | |
1088 seq1 = _msf_seqList[i*2].seq; | |
1089 rseq1 = _msf_seqList[i*2].rseq; | |
1090 qual1 = _msf_seqList[i*2].qual; | |
1091 reverse(_msf_seqList[i*2].qual, rqual1, QUAL_LENGTH); | |
1092 | |
1093 seq2 = _msf_seqList[i*2+1].seq; | |
1094 rseq2 = _msf_seqList[i*2+1].rseq; | |
1095 qual2 = _msf_seqList[i*2+1].qual; | |
1096 reverse(_msf_seqList[i*2+1].qual, rqual2, QUAL_LENGTH); | |
1097 | |
1098 | |
1099 if (pairedEndDiscordantMode) | |
1100 { | |
1101 for (k=0; k<size1; k++) | |
1102 { | |
1103 int tm = -1; | |
1104 mi1[k].score = calculateScore(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, (mi1[k].dir==-1)?rqual1:qual1, &tm); | |
1105 mi1[k].err = tm; | |
1106 } | |
1107 | |
1108 for (k=0; k<size2; k++) | |
1109 { | |
1110 int tm = -1; | |
1111 mi2[k].score = calculateScore(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, (mi2[k].dir==-1)?rqual2:qual2, &tm); | |
1112 mi2[k].err = tm; | |
1113 } | |
1114 | |
1115 } | |
1116 else | |
1117 { | |
1118 for (k=0; k<size1; k++) | |
1119 { | |
1120 mi1[k].err = calculateMD(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, -1, &_msf_op); | |
1121 sprintf(mi1[k].md, "%s", _msf_op); | |
1122 } | |
1123 | |
1124 for (k=0; k<size2; k++) | |
1125 { | |
1126 mi2[k].err = calculateMD(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, -1, &_msf_op); | |
1127 sprintf(mi2[k].md, "%s", _msf_op); | |
1128 } | |
1129 } | |
1130 pos = 0; | |
1131 | |
1132 for (j=0; j<size1; j++) | |
1133 { | |
1134 lm = mi1[j].loc - maxPairEndedDistance + 1; | |
1135 ll = mi1[j].loc - minPairEndedDistance + 1; | |
1136 rl = mi1[j].loc + minPairEndedDistance - 1; | |
1137 rm = mi1[j].loc + maxPairEndedDistance - 1; | |
1138 | |
1139 //fprintf(stdout, "%d %d %d %d %d\n",lm, ll,mi1[j].loc ,rl, rm); | |
1140 | |
1141 while (pos<size2 && mi2[pos].loc < lm) | |
1142 { | |
1143 pos++; | |
1144 } | |
1145 | |
1146 //fprintf(stdout, "POS: %d %d \n", pos, mi2[pos].loc); | |
1147 | |
1148 k = pos; | |
1149 while (k<size2 && mi2[k].loc <= rm) | |
1150 { | |
1151 if (mi2[k].loc <= ll || mi2[k].loc >= rl) | |
1152 { | |
1153 if (pairedEndDiscordantMode) | |
1154 { | |
1155 int tmp; | |
1156 int rNo = i; | |
1157 int loc = mi1[j].loc*mi1[j].dir; | |
1158 int err = mi1[j].err; | |
1159 float sc = mi1[j].score; | |
1160 | |
1161 char l = strlen(_msf_refGenName); | |
1162 | |
1163 tmp = fwrite(&rNo, sizeof(int), 1, out); | |
1164 | |
1165 tmp = fwrite(&l, sizeof(char), 1, out); | |
1166 tmp = fwrite(_msf_refGenName, sizeof(char), l, out); | |
1167 | |
1168 tmp = fwrite(&loc, sizeof(int), 1, out); | |
1169 tmp = fwrite(&err, sizeof(char), 1, out); | |
1170 tmp = fwrite(&sc, sizeof(float), 1, out); | |
1171 | |
1172 | |
1173 loc = mi2[k].loc*mi2[k].dir; | |
1174 err = mi2[k].err; | |
1175 sc = mi2[k].score; | |
1176 | |
1177 tmp = fwrite(&loc, sizeof(int), 1, out); | |
1178 tmp = fwrite(&err, sizeof(char), 1, out); | |
1179 tmp = fwrite(&sc, sizeof(float), 1, out); | |
1180 } // end discordant | |
1181 else | |
1182 { //start sampe | |
1183 char *seq; | |
1184 char *qual; | |
1185 char d1; | |
1186 char d2; | |
1187 int isize; | |
1188 int proper=0; | |
1189 // ISIZE CALCULATION | |
1190 // The distance between outer edges | |
1191 isize = abs(mi1[j].loc - mi2[k].loc)+SEQ_LENGTH-1; | |
1192 if (mi1[j].loc - mi2[k].loc > 0) | |
1193 { | |
1194 isize *= -1; | |
1195 } | |
1196 | |
1197 d1 = (mi1[j].dir == -1)?1:0; | |
1198 d2 = (mi2[k].dir == -1)?1:0; | |
1199 | |
1200 if ( d1 ) | |
1201 { | |
1202 seq = rseq1; | |
1203 qual = rqual1; | |
1204 } | |
1205 else | |
1206 { | |
1207 seq = seq1; | |
1208 qual = qual1; | |
1209 } | |
1210 | |
1211 if ( (mi1[j].loc < mi2[k].loc && !d1 && d2) || | |
1212 (mi1[j].loc > mi2[k].loc && d1 && !d2) ) | |
1213 { | |
1214 proper = 2; | |
1215 } | |
1216 else | |
1217 { | |
1218 proper = 0; | |
1219 } | |
1220 | |
1221 | |
1222 _msf_output.POS = mi1[j].loc; | |
1223 _msf_output.MPOS = mi2[k].loc; | |
1224 _msf_output.FLAG = 1+proper+16*d1+32*d2+64; | |
1225 _msf_output.ISIZE = isize; | |
1226 _msf_output.SEQ = seq, | |
1227 _msf_output.QUAL = qual; | |
1228 _msf_output.QNAME = _msf_seqList[i*2].name; | |
1229 _msf_output.RNAME = _msf_refGenName; | |
1230 _msf_output.MAPQ = 255; | |
1231 _msf_output.CIGAR = _msf_cigar; | |
1232 _msf_output.MRNAME = "="; | |
1233 | |
1234 _msf_output.optSize = 2; | |
1235 _msf_output.optFields = _msf_optionalFields; | |
1236 | |
1237 _msf_optionalFields[0].tag = "NM"; | |
1238 _msf_optionalFields[0].type = 'i'; | |
1239 _msf_optionalFields[0].iVal = mi1[j].err; | |
1240 | |
1241 _msf_optionalFields[1].tag = "MD"; | |
1242 _msf_optionalFields[1].type = 'Z'; | |
1243 _msf_optionalFields[1].sVal = mi1[j].md; | |
1244 | |
1245 | |
1246 output(_msf_output); | |
1247 | |
1248 if ( d2 ) | |
1249 { | |
1250 seq = rseq2; | |
1251 qual = rqual2; | |
1252 } | |
1253 else | |
1254 { | |
1255 seq = seq2; | |
1256 qual = qual2; | |
1257 } | |
1258 | |
1259 _msf_output.POS = mi2[k].loc; | |
1260 _msf_output.MPOS = mi1[j].loc; | |
1261 _msf_output.FLAG = 1+proper+16*d2+32*d1+128; | |
1262 _msf_output.ISIZE = -isize; | |
1263 _msf_output.SEQ = seq, | |
1264 _msf_output.QUAL = qual; | |
1265 _msf_output.QNAME = _msf_seqList[i*2].name; | |
1266 _msf_output.RNAME = _msf_refGenName; | |
1267 _msf_output.MAPQ = 255; | |
1268 _msf_output.CIGAR = _msf_cigar; | |
1269 _msf_output.MRNAME = "="; | |
1270 | |
1271 _msf_output.optSize = 2; | |
1272 _msf_output.optFields = _msf_optionalFields; | |
1273 | |
1274 _msf_optionalFields[0].tag = "NM"; | |
1275 _msf_optionalFields[0].type = 'i'; | |
1276 _msf_optionalFields[0].iVal = mi2[k].err;; | |
1277 | |
1278 _msf_optionalFields[1].tag = "MD"; | |
1279 _msf_optionalFields[1].type = 'Z'; | |
1280 _msf_optionalFields[1].sVal = mi2[k].md; | |
1281 | |
1282 output(_msf_output); | |
1283 } //end sampe | |
1284 } | |
1285 k++; | |
1286 } | |
1287 } | |
1288 } | |
1289 | |
1290 if (pairedEndDiscordantMode) | |
1291 { | |
1292 fclose(out); | |
1293 } | |
1294 | |
1295 | |
1296 freeMem(mi1, sizeof(FullMappingInfo)*_msf_maxLSize); | |
1297 freeMem(mi2, sizeof(FullMappingInfo)*_msf_maxRSize); | |
1298 | |
1299 for (i=0; i<_msf_openFiles; i++) | |
1300 { | |
1301 fclose(in1[i]); | |
1302 fclose(in2[i]); | |
1303 //fprintf(stdout, "%s %s \n", fname1[i], fname2[i]); | |
1304 unlink(fname1[i]); | |
1305 unlink(fname2[i]); | |
1306 } | |
1307 _msf_openFiles = 0; | |
1308 | |
1309 } | |
1310 | |
1311 /**********************************************/ | |
1312 /**********************************************/ | |
1313 /**********************************************/ | |
1314 /**********************************************/ | |
1315 float calculateScore(int index, char *seq, char *qual, int *err) | |
1316 { | |
1317 int i; | |
1318 char *ref; | |
1319 char *ver; | |
1320 | |
1321 ref = _msf_refGen + index-1; | |
1322 ver = seq; | |
1323 float score = 1; | |
1324 | |
1325 if (*err > 0 || *err == -1) | |
1326 { | |
1327 *err = 0; | |
1328 | |
1329 for (i=0; i < SEQ_LENGTH; i++) | |
1330 { | |
1331 if (*ref != *ver) | |
1332 { | |
1333 //fprintf(stdout, "%c %c %d", *ref, *ver, *err); | |
1334 (*err)++; | |
1335 score *= 0.001 + 1/pow( 10, ((qual[i]-33)/10.0) ); | |
1336 } | |
1337 ref++; | |
1338 ver++; | |
1339 } | |
1340 | |
1341 } | |
1342 return score; | |
1343 } | |
1344 | |
1345 /**********************************************/ | |
1346 void outputPairedEndDiscPP() | |
1347 { | |
1348 char genName[SEQ_LENGTH]; | |
1349 char fname1[FILE_NAME_LENGTH]; | |
1350 char fname2[FILE_NAME_LENGTH]; | |
1351 char fname3[FILE_NAME_LENGTH]; | |
1352 char fname4[FILE_NAME_LENGTH]; | |
1353 char fname5[FILE_NAME_LENGTH]; | |
1354 char fname6[FILE_NAME_LENGTH]; | |
1355 char l; | |
1356 int loc1, loc2; | |
1357 char err1, err2; | |
1358 char dir1, dir2; | |
1359 float sc1, sc2, lsc=0; | |
1360 int flag = 0; | |
1361 int rNo,lrNo = -1; | |
1362 int tmp; | |
1363 FILE *in, *in1, *in2, *out, *out1, *out2; | |
1364 | |
1365 sprintf(fname1, "%s__%s__disc", mappingOutputPath, mappingOutput); | |
1366 sprintf(fname2, "%s__%s__oea1", mappingOutputPath, mappingOutput); | |
1367 sprintf(fname3, "%s__%s__oea2", mappingOutputPath, mappingOutput); | |
1368 sprintf(fname4, "%s%s_DIVET.vh", mappingOutputPath, mappingOutput); | |
1369 sprintf(fname5, "%s%s_OEA1.vh", mappingOutputPath, mappingOutput); | |
1370 sprintf(fname6, "%s%s_OEA2.vh", mappingOutputPath, mappingOutput); | |
1371 | |
1372 in = fileOpen(fname1, "r"); | |
1373 in1 = fileOpen(fname2, "r"); | |
1374 in2 = fileOpen(fname3, "r"); | |
1375 out = fileOpen(fname4, "w"); | |
1376 out1 = fileOpen(fname5, "w"); | |
1377 out2 = fileOpen(fname6, "w"); | |
1378 if (in != NULL) | |
1379 { | |
1380 flag = fread(&rNo, sizeof(int), 1, in); | |
1381 } | |
1382 else | |
1383 { | |
1384 flag = 0; | |
1385 } | |
1386 | |
1387 | |
1388 while (flag) | |
1389 { | |
1390 | |
1391 tmp = fread(&l, sizeof(char), 1, in); | |
1392 tmp = fread(genName, sizeof(char), l, in); | |
1393 genName[l]='\0'; | |
1394 tmp = fread(&loc1, sizeof(int), 1, in); | |
1395 tmp = fread(&err1, sizeof(char), 1, in); | |
1396 tmp = fread(&sc1, sizeof(float), 1, in); | |
1397 tmp = fread(&loc2, sizeof(int), 1, in); | |
1398 tmp = fread(&err2, sizeof(char), 1, in); | |
1399 tmp = fread(&sc2, sizeof(float), 1, in); | |
1400 | |
1401 //if (rNo ==6615) | |
1402 // fprintf(stdout, "%s %d: %d %0.20f %d %d %0.20f\n", genName, loc1, err1, sc1, loc2, err2, sc2); | |
1403 | |
1404 if (_msf_seqList[rNo*2].hits[0] % 2 == 0 && _msf_seqHits[rNo] < DISCORDANT_CUT_OFF) | |
1405 { | |
1406 dir1 = dir2 = 'F'; | |
1407 | |
1408 if (loc1 < 0) | |
1409 { | |
1410 dir1 = 'R'; | |
1411 loc1 = -loc1; | |
1412 } | |
1413 | |
1414 if (loc2 < 0) | |
1415 { | |
1416 dir2 = 'R'; | |
1417 loc2 = -loc2; | |
1418 } | |
1419 | |
1420 if (rNo != lrNo) | |
1421 { | |
1422 int j; | |
1423 for (j=0; j<SEQ_LENGTH; j++) | |
1424 { | |
1425 lsc += _msf_seqList[rNo*2].qual[j]+_msf_seqList[rNo*2+1].qual[j]; | |
1426 } | |
1427 lsc /= 2*SEQ_LENGTH; | |
1428 lsc -= 33; | |
1429 lrNo = rNo; | |
1430 } | |
1431 | |
1432 int inv = 0; | |
1433 int eve = 0; | |
1434 int dist = 0; | |
1435 char event; | |
1436 | |
1437 //fprintf(stdout, "%c %c ", dir1, dir2); | |
1438 | |
1439 if ( dir1 == dir2 ) | |
1440 { | |
1441 event = 'V'; | |
1442 //fprintf(stdout, "Inverstion \n"); | |
1443 } | |
1444 else | |
1445 { | |
1446 if (loc1 < loc2) | |
1447 { | |
1448 | |
1449 //fprintf(stdout, "< %d ", loc2-loc1-SEQ_LENGTH); | |
1450 | |
1451 if (dir1 == 'R' && dir2 == 'F') | |
1452 { | |
1453 event = 'E'; | |
1454 | |
1455 //fprintf(stdout, "Everted \n"); | |
1456 } | |
1457 else if ( loc2 - loc1 >= maxPairEndedDiscordantDistance ) | |
1458 { | |
1459 event = 'D'; | |
1460 //fprintf(stdout, "Deletion \n"); | |
1461 } | |
1462 else | |
1463 { | |
1464 event = 'I'; | |
1465 //fprintf(stdout, "Insertion \n"); | |
1466 } | |
1467 } | |
1468 else if (loc2 < loc1) | |
1469 { | |
1470 //fprintf(stdout, "> %d ", loc1-loc2-SEQ_LENGTH); | |
1471 if (dir2 == 'R' && dir1 == 'F') | |
1472 { | |
1473 event = 'E'; | |
1474 //fprintf(stdout, "Everted \n"); | |
1475 } | |
1476 else if ( loc1 - loc2 >= maxPairEndedDiscordantDistance ) | |
1477 { | |
1478 event = 'D'; | |
1479 //fprintf(stdout, "Deletion \n"); | |
1480 } | |
1481 else | |
1482 { | |
1483 event = 'I'; | |
1484 //fprintf(stdout, "Insertion \n"); | |
1485 } | |
1486 } | |
1487 } | |
1488 _msf_seqList[rNo*2].hits[0] = 2; | |
1489 fprintf(out, "%s\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%0.20f\n", | |
1490 _msf_seqList[rNo*2].name, genName, loc1, (loc1+SEQ_LENGTH-1), dir1, genName, loc2, (loc2+SEQ_LENGTH-1), dir2, event, (err1+err2), lsc, sc1*sc2); | |
1491 } | |
1492 flag = fread(&rNo, sizeof(int), 1, in); | |
1493 | |
1494 } | |
1495 | |
1496 /* | |
1497 MappingInfoNode *lr[_msf_seqListSize/2]; | |
1498 MappingInfoNode *rr[_msf_seqListSize/2]; | |
1499 MappingInfoNode *cur, *tmpDel, *cur2; | |
1500 | |
1501 | |
1502 int ls[_msf_seqListSize/2]; | |
1503 int rs[_msf_seqListSize/2]; | |
1504 | |
1505 | |
1506 int i=0; | |
1507 | |
1508 for (i = 0; i<_msf_seqListSize/2; i++) | |
1509 { | |
1510 lr[i] = rr[i] = NULL; | |
1511 ls[i] = rs[i] = 0; | |
1512 } | |
1513 | |
1514 | |
1515 | |
1516 if (in1 != NULL) | |
1517 { | |
1518 flag = fread(&rNo, sizeof(int), 1, in1); | |
1519 } | |
1520 else | |
1521 { | |
1522 flag = 0; | |
1523 } | |
1524 | |
1525 | |
1526 while (flag) | |
1527 { | |
1528 tmp = fread(&loc1, sizeof(int), 1, in1); | |
1529 tmp = fread(&err1, sizeof(char), 1, in1); | |
1530 tmp = fread(&sc1, sizeof(float), 1, in1); | |
1531 tmp = fread(&l, sizeof(char), 1, in1); | |
1532 tmp = fread(genName, sizeof(char), l, in1); | |
1533 genName[l]='\0'; | |
1534 | |
1535 if (_msf_seqList[rNo*2].hits[0] == 0) | |
1536 { | |
1537 | |
1538 if ( ls[rNo] < DISCORDANT_CUT_OFF ) | |
1539 { | |
1540 ls[rNo]++; | |
1541 | |
1542 cur = lr[rNo]; | |
1543 | |
1544 if (cur !=NULL) | |
1545 { | |
1546 if (err1 == cur->err) | |
1547 { | |
1548 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); | |
1549 | |
1550 nr->loc = loc1; | |
1551 nr->err = err1; | |
1552 nr->score = sc1; | |
1553 nr->next = lr[rNo]; | |
1554 sprintf(nr->chr,"%s", genName); | |
1555 lr[rNo] = nr; | |
1556 } | |
1557 else if (err1 < cur->err) | |
1558 { | |
1559 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); | |
1560 | |
1561 nr->loc = loc1; | |
1562 nr->err = err1; | |
1563 nr->score = sc1; | |
1564 sprintf(nr->chr,"%s", genName); | |
1565 nr->next = NULL; | |
1566 lr[rNo] = nr; | |
1567 while (cur!=NULL) | |
1568 { | |
1569 tmpDel = cur; | |
1570 cur = cur->next; | |
1571 freeMem(tmpDel, sizeof(MappingInfoNode)); | |
1572 } | |
1573 } | |
1574 } | |
1575 else | |
1576 { | |
1577 | |
1578 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); | |
1579 | |
1580 nr->loc = loc1; | |
1581 nr->err = err1; | |
1582 nr->score = sc1; | |
1583 sprintf(nr->chr,"%s", genName); | |
1584 nr->next = NULL; | |
1585 lr[rNo] = nr; | |
1586 } | |
1587 | |
1588 if (ls[rNo] > DISCORDANT_CUT_OFF) | |
1589 { | |
1590 cur = lr[rNo]; | |
1591 while (cur!=NULL) | |
1592 { | |
1593 tmpDel = cur; | |
1594 cur = cur->next; | |
1595 freeMem(tmpDel, sizeof(MappingInfoNode)); | |
1596 } | |
1597 } | |
1598 } | |
1599 | |
1600 } | |
1601 flag = fread(&rNo, sizeof(int), 1, in1); | |
1602 | |
1603 } | |
1604 | |
1605 | |
1606 if (in2 != NULL) | |
1607 { | |
1608 flag = fread(&rNo, sizeof(int), 1, in2); | |
1609 } | |
1610 else | |
1611 { | |
1612 flag = 0; | |
1613 } | |
1614 | |
1615 | |
1616 while (flag) | |
1617 { | |
1618 tmp = fread(&loc1, sizeof(int), 1, in2); | |
1619 tmp = fread(&err1, sizeof(char), 1, in2); | |
1620 tmp = fread(&sc1, sizeof(float), 1, in2); | |
1621 tmp = fread(&l, sizeof(char), 1, in2); | |
1622 tmp = fread(genName, sizeof(char), l, in2); | |
1623 genName[l]='\0'; | |
1624 | |
1625 if (_msf_seqList[rNo*2].hits[0] == 0) | |
1626 { | |
1627 | |
1628 if ( rs[rNo] < DISCORDANT_CUT_OFF ) | |
1629 { | |
1630 rs[rNo]++; | |
1631 | |
1632 cur = rr[rNo]; | |
1633 | |
1634 if (cur !=NULL) | |
1635 { | |
1636 if (err1 == cur->err) | |
1637 { | |
1638 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); | |
1639 | |
1640 nr->loc = loc1; | |
1641 nr->err = err1; | |
1642 nr->score = sc1; | |
1643 nr->next = rr[rNo]; | |
1644 sprintf(nr->chr,"%s", genName); | |
1645 rr[rNo] = nr; | |
1646 } | |
1647 else if (err1 < cur->err) | |
1648 { | |
1649 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); | |
1650 | |
1651 nr->loc = loc1; | |
1652 nr->err = err1; | |
1653 nr->score = sc1; | |
1654 sprintf(nr->chr,"%s", genName); | |
1655 nr->next = NULL; | |
1656 rr[rNo] = nr; | |
1657 while (cur!=NULL) | |
1658 { | |
1659 tmpDel = cur; | |
1660 cur = cur->next; | |
1661 freeMem(tmpDel, sizeof(MappingInfoNode)); | |
1662 } | |
1663 } | |
1664 } | |
1665 else | |
1666 { | |
1667 | |
1668 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); | |
1669 | |
1670 nr->loc = loc1; | |
1671 nr->err = err1; | |
1672 nr->score = sc1; | |
1673 sprintf(nr->chr,"%s", genName); | |
1674 nr->next = NULL; | |
1675 rr[rNo] = nr; | |
1676 } | |
1677 | |
1678 if (rs[rNo] > DISCORDANT_CUT_OFF) | |
1679 { | |
1680 cur = rr[rNo]; | |
1681 while (cur!=NULL) | |
1682 { | |
1683 tmpDel = cur; | |
1684 cur = cur->next; | |
1685 freeMem(tmpDel, sizeof(MappingInfoNode)); | |
1686 } | |
1687 } | |
1688 } | |
1689 } | |
1690 flag = fread(&rNo, sizeof(int), 1, in2); | |
1691 | |
1692 } | |
1693 | |
1694 | |
1695 for (i=0; i<_msf_seqListSize/2; i++) | |
1696 { | |
1697 int j; | |
1698 for (j=0; j<SEQ_LENGTH; j++) | |
1699 { | |
1700 lsc += _msf_seqList[i*2].qual[j]+_msf_seqList[i*2+1].qual[j]; | |
1701 } | |
1702 lsc /= 2*SEQ_LENGTH; | |
1703 lsc -= 33; | |
1704 if (ls[i] * rs[i] < DISCORDANT_CUT_OFF && ls[i] & rs[i] > 0) | |
1705 { | |
1706 cur = lr[i]; | |
1707 while (cur != NULL) | |
1708 { | |
1709 cur2 = rr[i]; | |
1710 if (cur->loc < 0) | |
1711 { | |
1712 dir1 = 'R'; | |
1713 loc1 = -cur->loc; | |
1714 } | |
1715 else | |
1716 { | |
1717 dir1 = 'F'; | |
1718 loc1 = cur->loc; | |
1719 } | |
1720 while (cur2 != NULL) | |
1721 { | |
1722 | |
1723 if (cur2->loc < 0) | |
1724 { | |
1725 dir2 = 'R'; | |
1726 loc2 = -cur2->loc; | |
1727 } | |
1728 else | |
1729 { | |
1730 dir2 = 'F'; | |
1731 loc2 = cur2->loc; | |
1732 } | |
1733 | |
1734 fprintf(out, "%s\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%0.20f\n", | |
1735 _msf_seqList[i*2].name, cur->chr, loc1, (loc1+SEQ_LENGTH-1), dir1, cur2->chr, loc2, (loc2+SEQ_LENGTH-1), dir2, 'T', (cur->err+cur2->err), lsc, cur->score*cur2->score); | |
1736 cur2 = cur2->next; | |
1737 } | |
1738 cur = cur->next; | |
1739 } | |
1740 } | |
1741 | |
1742 }*/ | |
1743 | |
1744 | |
1745 fclose(in); | |
1746 fclose(in1); | |
1747 fclose(in2); | |
1748 fclose(out); | |
1749 fclose(out1); | |
1750 fclose(out2); | |
1751 | |
1752 unlink(fname1); | |
1753 unlink(fname2); | |
1754 unlink(fname3); | |
1755 unlink(fname5); | |
1756 unlink(fname6); | |
1757 } |