comparison mrsfast-2.3.0.2/MrsFAST.c @ 0:ec628ba33878 default tip

Uploaded source code for mrsFAST
author calkan
date Tue, 21 Feb 2012 10:39:28 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ec628ba33878
1 /*
2 * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without modification,
6 * are permitted provided that the following conditions are met:
7 *
8 * Redistributions of source code must retain the above copyright notice, this list
9 * of conditions and the following disclaimer.
10 * - Redistributions in binary form must reproduce the above copyright notice, this
11 * list of conditions and the following disclaimer in the documentation and/or other
12 * materials provided with the distribution.
13 * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be
14 * used to endorse or promote products derived from this software without specific
15 * prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 /*
31 * Author : Faraz Hach
32 * Email : fhach AT cs DOT sfu
33 * Last Update : 2009-01-29
34 */
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <math.h>
40 #include "Common.h"
41 #include "Reads.h"
42 #include "HashTable.h"
43 #include "Output.h"
44 #include "MrsFAST.h"
45 #include "RefGenome.h"
46
47 float calculateScore(int index, char *seq, char *qual, int *err);
48 unsigned char mrFAST = 0;
49 char *versionNumberF="0.2";
50
51 long long verificationCnt = 0;
52 long long mappingCnt = 0;
53 long long mappedSeqCnt = 0;
54 long long completedSeqCnt = 0;
55 char *mappingOutput;
56 /**********************************************/
57 char *_msf_refGen = NULL;
58 int _msf_refGenLength = 0;
59 int _msf_refGenOffset = 0;
60 char *_msf_refGenName = NULL;
61
62 int _msf_refGenBeg;
63 int _msf_refGenEnd;
64
65 IHashTable *_msf_hashTable = NULL;
66
67 int *_msf_samplingLocs;
68 int *_msf_samplingLocsEnds;
69 int _msf_samplingLocsSize;
70
71 Read *_msf_seqList;
72 int _msf_seqListSize;
73
74 ReadIndexTable *_msf_rIndex = NULL;
75 int _msf_rIndexSize;
76 int _msf_rIndexMax;
77
78 SAM _msf_output;
79
80 OPT_FIELDS *_msf_optionalFields;
81
82 char *_msf_op;
83
84 char _msf_numbers[200][3];
85 char _msf_cigar[5];
86
87 MappingInfo *_msf_mappingInfo;
88
89 int *_msf_seqHits;
90 int _msf_openFiles = 0;
91 int _msf_maxLSize=0;
92 int _msf_maxRSize=0;
93 /**********************************************/
94 int compare (const void *a, const void *b)
95 {
96 return ((Pair *)a)->hv - ((Pair *)b)->hv;
97 }
98 /**********************************************/
99 void preProcessReads()
100 {
101 int i=0;
102 int j=0;
103 int pos = 0;
104
105 _msf_rIndexMax = -1;
106
107 int tmpSize = _msf_seqListSize*_msf_samplingLocsSize*2;
108 Pair *tmp = getMem(sizeof(Pair)*tmpSize);
109
110 for (i=0; i<_msf_seqListSize; i++)
111 {
112 for (j=0; j< _msf_samplingLocsSize; j++)
113 {
114
115 tmp[pos].hv = hashVal(_msf_seqList[i].seq+_msf_samplingLocs[j]);
116 tmp[pos].seqInfo = pos;
117 pos++;
118 }
119 for (j=0; j<_msf_samplingLocsSize; j++)
120 {
121 tmp[pos].hv = hashVal(_msf_seqList[i].rseq+_msf_samplingLocs[j]);
122 tmp[pos].seqInfo = pos;
123 pos++;
124 }
125 }
126
127 qsort(tmp, tmpSize, sizeof(Pair), compare);
128
129
130 int uniq = 0;
131 int prev = -2;
132 int beg = -1;
133 int end = -1;
134
135 for (i=0; i<tmpSize; i++)
136 {
137 if (prev != tmp[i].hv)
138 {
139 uniq ++;
140 prev = tmp[i].hv;
141 }
142 }
143
144 _msf_rIndexSize = uniq;
145 _msf_rIndex = getMem(sizeof(ReadIndexTable)*_msf_rIndexSize);
146 prev = -2;
147
148 j=0;
149 beg =0;
150 while (beg < tmpSize)
151 {
152 end = beg;
153 while (end+1<tmpSize && tmp[end+1].hv==tmp[beg].hv)
154 end++;
155
156 _msf_rIndex[j].hv = tmp[beg].hv;
157 _msf_rIndex[j].seqInfo = getMem(sizeof(int)*(end-beg+2));
158 _msf_rIndex[j].seqInfo[0] = end-beg+1;
159 if ((end-beg+1) > _msf_rIndexMax)
160 _msf_rIndexMax = end-beg+1;
161
162 for (i=1; i<=_msf_rIndex[j].seqInfo[0]; i++)
163 {
164 _msf_rIndex[j].seqInfo[i]=tmp[beg+i-1].seqInfo;
165 }
166 j++;
167 beg = end+1;
168 }
169 freeMem(tmp, sizeof(Pair)*tmpSize);
170 }
171 /**********************************************/
172 void initFAST(Read *seqList, int seqListSize, int *samplingLocs, int samplingLocsSize, char *genFileName)
173 {
174 if (_msf_optionalFields == NULL)
175 {
176 _msf_op = getMem(SEQ_LENGTH);
177 if (pairedEndMode)
178 {
179 _msf_optionalFields = getMem(4*sizeof(OPT_FIELDS));
180 }
181 else
182 {
183 _msf_optionalFields = getMem(2*sizeof(OPT_FIELDS));
184 }
185
186 int i;
187 for (i=0; i<200;i++)
188 {
189 sprintf(_msf_numbers[i],"%d%c",i, '\0');
190 }
191 sprintf(_msf_cigar, "%dM", SEQ_LENGTH);
192 }
193
194 if (_msf_samplingLocsEnds == NULL)
195 {
196 int i;
197 _msf_samplingLocs = samplingLocs;
198 _msf_samplingLocsSize = samplingLocsSize;
199
200 _msf_samplingLocsEnds = malloc(sizeof(int)*_msf_samplingLocsSize);
201 for (i=0; i<_msf_samplingLocsSize; i++)
202 {
203 _msf_samplingLocsEnds[i]=_msf_samplingLocs[i]+WINDOW_SIZE-1;
204 }
205
206 _msf_seqList = seqList;
207 _msf_seqListSize = seqListSize;
208
209 preProcessReads();
210 }
211 if (_msf_refGenName == NULL)
212 {
213 _msf_refGenName = getMem(SEQ_LENGTH);
214 }
215 _msf_refGen = getRefGenome();
216 _msf_refGenLength = strlen(_msf_refGen);
217 _msf_refGenOffset = getRefGenomeOffset();
218 sprintf(_msf_refGenName,"%s%c", getRefGenomeName(), '\0');
219
220 if (pairedEndMode && _msf_seqHits == NULL)
221 {
222 _msf_mappingInfo = getMem(seqListSize * sizeof (MappingInfo));
223
224 int i=0;
225 for (i=0; i<seqListSize; i++)
226 {
227 //_msf_mappingInfo[i].next = getMem(sizeof(MappingLocations));
228 _msf_mappingInfo[i].next = NULL;
229 _msf_mappingInfo[i].size = 0;
230 }
231
232 _msf_seqHits = getMem((_msf_seqListSize/2) * sizeof(int));
233
234
235 for (i=0; i<_msf_seqListSize/2; i++)
236 {
237 _msf_seqHits[i] = 0;
238 }
239
240 initLoadingRefGenome(genFileName);
241 }
242
243 if (_msf_refGenOffset == 0)
244 {
245 _msf_refGenBeg = 1;
246 }
247 else
248 {
249 _msf_refGenBeg = CONTIG_OVERLAP - SEQ_LENGTH + 2;
250 }
251 _msf_refGenEnd = _msf_refGenLength - SEQ_LENGTH + 1;
252
253
254 }
255 /**********************************************/
256 void finalizeFAST()
257 {
258 freeMem(_msf_seqHits, (_msf_seqListSize/2) * sizeof(int));
259 freeMem(_msf_refGenName, SEQ_LENGTH);
260 int i;
261 for (i=0; i<_msf_rIndexSize; i++)
262 {
263 freeMem(_msf_rIndex[i].seqInfo, _msf_rIndex[i].seqInfo[0]+1);
264 }
265 freeMem(_msf_rIndex, _msf_rIndexSize);
266 }
267
268 /**********************************************/
269 int verifySingleEnd(int index, char* seq, int offset)
270 {
271 int curOff = 0;
272 int i;
273
274 char *ref;
275
276 int err;
277 int errCnt =0;
278 int errCntOff = 0;
279 int NCntOff = 0;
280
281 int matchCnt = 0;
282 int pp = 0;
283
284 ref = _msf_refGen + index - 1;
285
286 verificationCnt++;
287
288 for (i = 0; i < SEQ_LENGTH; i++)
289 {
290 err = *ref != *seq;
291
292 errCnt += err;
293 if (errCnt > errThreshold)
294 {
295
296 return -1;
297 }
298
299 if (i >= _msf_samplingLocs[curOff] && i <= _msf_samplingLocsEnds[curOff])
300 {
301 errCntOff += err;
302 NCntOff += (*seq == 'N');
303 }
304 else if (curOff < _msf_samplingLocsSize && i>=_msf_samplingLocs[curOff+1])
305 {
306
307 if (errCntOff == 0 && NCntOff == 0 && offset > curOff)
308 {
309 return -1;
310 }
311
312 errCntOff = 0;
313 NCntOff = 0;
314 curOff++;
315
316 if ( i >= _msf_samplingLocs[curOff])
317 {
318 errCntOff += err;
319 NCntOff += (*seq == 'N');
320 }
321 }
322
323 ref++;
324 seq++;
325 }
326 return errCnt;
327 }
328
329 /**********************************************/
330 int calculateMD(int index, char *seq, int err, char **opSeq)
331 {
332 int i;
333 char *ref;
334 char *ver;
335 short matchCnt = 0;
336 char *op = *opSeq;
337 int pp = 0;
338
339 ref = _msf_refGen + index-1;
340 ver = seq;
341
342 if (err>0 || err == -1 )
343 {
344
345 err = 0;
346 for (i=0; i < SEQ_LENGTH; i++)
347 {
348 if (* ref != *ver)
349 {
350 err++;
351 if (matchCnt)
352 {
353 if (matchCnt < 10)
354 {
355 op[pp++]=_msf_numbers[matchCnt][0];
356 }
357 else if (matchCnt < 100)
358 {
359 op[pp++]=_msf_numbers[matchCnt][0];
360 op[pp++]=_msf_numbers[matchCnt][1];
361 }
362 else
363 {
364 op[pp++]=_msf_numbers[matchCnt][0];
365 op[pp++]=_msf_numbers[matchCnt][1];
366 op[pp++]=_msf_numbers[matchCnt][2];
367 }
368
369 matchCnt = 0;
370 }
371 op[pp++]=*ref;
372 }
373 else
374 {
375 matchCnt++;
376 }
377 ref++;
378 ver++;
379 }
380
381 }
382 if (err == 0)
383 {
384 matchCnt = SEQ_LENGTH;
385 }
386
387 if (matchCnt>0)
388 {
389 if (matchCnt < 10)
390 {
391 op[pp++]=_msf_numbers[matchCnt][0];
392 }
393 else if (matchCnt < 100)
394 {
395 op[pp++]=_msf_numbers[matchCnt][0];
396 op[pp++]=_msf_numbers[matchCnt][1];
397 }
398 else
399 {
400 op[pp++]=_msf_numbers[matchCnt][0];
401 op[pp++]=_msf_numbers[matchCnt][1];
402 op[pp++]=_msf_numbers[matchCnt][2];
403 }
404 }
405 op[pp]='\0';
406
407 return err;
408 }
409
410 /**********************************************/
411 void mapSingleEndSeqListBal(unsigned int *l1, int s1, unsigned int *l2, int s2, int dir)
412 {
413
414 if (s1 == 0 || s2 == 0)
415 {
416 return;
417 }
418 else if (s1 == s2 && s1 <= 50)
419 {
420
421 int j = 0;
422 int z = 0;
423 int *locs;
424 int *seqInfo;
425 char *_tmpSeq, *_tmpQual;
426 char rqual[QUAL_LENGTH+1];
427 rqual[QUAL_LENGTH]='\0';
428
429 if (dir > 0)
430 {
431 locs = (int *) l1;
432 seqInfo = (int *) l2;
433 }
434 else
435 {
436 locs = (int *) l2;
437 seqInfo = (int *) l1;
438 }
439
440
441 for (j=0; j<s2; j++)
442 {
443 int re = _msf_samplingLocsSize * 2;
444 int r = seqInfo[j]/re;
445 if (maxHits!=0 && _msf_seqList[r].hits[0] == maxHits)
446 {
447 continue;
448 }
449
450 int x = seqInfo[j] % re;
451 int o = x % _msf_samplingLocsSize;
452 char d = (x/_msf_samplingLocsSize)?1:0;
453
454
455 if (d)
456 {
457 reverse(_msf_seqList[r].qual, rqual, QUAL_LENGTH);
458 _tmpQual = rqual;
459 _tmpSeq = _msf_seqList[r].rseq;
460 }
461 else
462 {
463 _tmpQual = _msf_seqList[r].qual;
464 _tmpSeq = _msf_seqList[r].seq;
465 }
466
467
468 for (z=0; z<s1; z++)
469 {
470 int genLoc = locs[z]-_msf_samplingLocs[o];
471
472
473 if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd )
474 continue;
475
476 int err = -1;
477
478
479
480 err = verifySingleEnd(genLoc, _tmpSeq, o);
481
482
483
484 if (err != -1)
485 {
486 calculateMD(genLoc, _tmpSeq, err, &_msf_op);
487 mappingCnt++;
488 _msf_seqList[r].hits[0]++;
489
490 _msf_output.QNAME = _msf_seqList[r].name;
491 _msf_output.FLAG = 16 * d;
492 _msf_output.RNAME = _msf_refGenName;
493 _msf_output.POS = genLoc + _msf_refGenOffset;
494 _msf_output.MAPQ = 255;
495 _msf_output.CIGAR = _msf_cigar;
496 _msf_output.MRNAME = "*";
497 _msf_output.MPOS = 0;
498 _msf_output.ISIZE = 0;
499 _msf_output.SEQ = _tmpSeq;
500 _msf_output.QUAL = _tmpQual;
501
502 _msf_output.optSize = 2;
503 _msf_output.optFields = _msf_optionalFields;
504
505 _msf_optionalFields[0].tag = "NM";
506 _msf_optionalFields[0].type = 'i';
507 _msf_optionalFields[0].iVal = err;
508
509 _msf_optionalFields[1].tag = "MD";
510 _msf_optionalFields[1].type = 'Z';
511 _msf_optionalFields[1].sVal = _msf_op;
512
513 output(_msf_output);
514
515
516 if (_msf_seqList[r].hits[0] == 1)
517 {
518 mappedSeqCnt++;
519 }
520
521 if ( maxHits == 0 )
522 {
523 _msf_seqList[r].hits[0] = 2;
524 }
525
526
527 if ( maxHits!=0 && _msf_seqList[r].hits[0] == maxHits)
528 {
529 completedSeqCnt++;
530 break;
531 }
532 }
533
534 }
535 }
536 }
537 else
538 {
539 int tmp1=s1/2, tmp2= s2/2;
540 if (tmp1 != 0)
541 mapSingleEndSeqListBal(l1, tmp1, l2+tmp2, s2-tmp2, dir);
542 mapSingleEndSeqListBal(l2+tmp2, s2-tmp2, l1+tmp1, s1-tmp1, -dir);
543 if (tmp2 !=0)
544 mapSingleEndSeqListBal(l1+tmp1, s1-tmp1, l2, tmp2, dir);
545 if (tmp1 + tmp2 != 0)
546 mapSingleEndSeqListBal(l2, tmp2, l1, tmp1, -dir);
547 }
548 }
549
550
551 /**********************************************/
552 void mapSingleEndSeqListTOP(unsigned int *l1, int s1, unsigned int *l2, int s2)
553 {
554 if (s1 < s2)
555 {
556 mapSingleEndSeqListBal(l1, s1, l2, s1,1);
557 mapSingleEndSeqListTOP(l1, s1, l2+s1, s2-s1);
558 }
559 else if (s1 > s2)
560 {
561 mapSingleEndSeqListBal(l1, s2, l2, s2,1);
562 mapSingleEndSeqListTOP(l1+s2, s1-s2, l2, s2);
563 }
564 else
565 {
566 mapSingleEndSeqListBal(l1, s1, l2, s2,1);
567 }
568 }
569
570
571 /**********************************************/
572 void mapSingleEndSeqList(unsigned int *l1, int s1, unsigned int *l2, int s2)
573 {
574 if ( s2/s1 <= 2)
575 {
576 int j = 0;
577 int z = 0;
578 int *locs = (int *) l1;
579 int *seqInfo = (int *) l2;
580 char *_tmpSeq, *_tmpQual;
581 char rqual[QUAL_LENGTH+1];
582 rqual[QUAL_LENGTH]='\0';
583
584 for (j=0; j<s2; j++)
585 {
586 int re = _msf_samplingLocsSize * 2;
587 int r = seqInfo[j]/re;
588 if (maxHits!=0 && _msf_seqList[r].hits[0] == maxHits)
589 {
590 continue;
591 }
592
593 int x = seqInfo[j] % re;
594 int o = x % _msf_samplingLocsSize;
595 char d = (x/_msf_samplingLocsSize)?1:0;
596
597
598 if (d)
599 {
600 reverse(_msf_seqList[r].qual, rqual, QUAL_LENGTH);
601 _tmpQual = rqual;
602 _tmpSeq = _msf_seqList[r].rseq;
603 }
604 else
605 {
606 _tmpQual = _msf_seqList[r].qual;
607 _tmpSeq = _msf_seqList[r].seq;
608 }
609
610
611 for (z=0; z<s1; z++)
612 {
613 int genLoc = locs[z]-_msf_samplingLocs[o];
614
615
616 if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd )
617 continue;
618
619 int err = -1;
620
621
622
623 err = verifySingleEnd(genLoc, _tmpSeq, o);
624
625
626
627 if (err != -1)
628 {
629 calculateMD(genLoc, _tmpSeq, err, &_msf_op);
630 mappingCnt++;
631 _msf_seqList[r].hits[0]++;
632
633 _msf_output.QNAME = _msf_seqList[r].name;
634 _msf_output.FLAG = 16 * d;
635 _msf_output.RNAME = _msf_refGenName;
636 _msf_output.POS = genLoc + _msf_refGenOffset;
637 _msf_output.MAPQ = 255;
638 _msf_output.CIGAR = _msf_cigar;
639 _msf_output.MRNAME = "*";
640 _msf_output.MPOS = 0;
641 _msf_output.ISIZE = 0;
642 _msf_output.SEQ = _tmpSeq;
643 _msf_output.QUAL = _tmpQual;
644
645 _msf_output.optSize = 2;
646 _msf_output.optFields = _msf_optionalFields;
647
648 _msf_optionalFields[0].tag = "NM";
649 _msf_optionalFields[0].type = 'i';
650 _msf_optionalFields[0].iVal = err;
651
652 _msf_optionalFields[1].tag = "MD";
653 _msf_optionalFields[1].type = 'Z';
654 _msf_optionalFields[1].sVal = _msf_op;
655
656 output(_msf_output);
657
658
659 if (_msf_seqList[r].hits[0] == 1)
660 {
661 mappedSeqCnt++;
662 }
663
664 if ( maxHits == 0 )
665 {
666 _msf_seqList[r].hits[0] = 2;
667 }
668
669
670 if ( maxHits!=0 && _msf_seqList[r].hits[0] == maxHits)
671 {
672 completedSeqCnt++;
673 break;
674 }
675 }
676
677 }
678 }
679 }
680 else if (s1 == 1)
681 {
682 // fprintf(stderr, "1");
683 int tmp = s2/2;
684 mapSingleEndSeqList(l1, s1, l2, tmp);
685 mapSingleEndSeqList(l1, s1, l2+tmp, s2-tmp);
686 }
687 else if (s2 == 1)
688 {
689 // fprintf(stderr, "2");
690 int tmp = s1/2;
691 mapSingleEndSeqList(l1, tmp, l2, s2);
692 mapSingleEndSeqList(l1+tmp, s1-tmp, l2, s2);
693 }
694 else
695 {
696 // fprintf(stderr, "3");
697 int tmp1=s1/2, tmp2= s2/2;
698 mapSingleEndSeqList(l1, tmp1, l2, tmp2);
699 mapSingleEndSeqList(l1+tmp1, s1-tmp1, l2, tmp2);
700 mapSingleEndSeqList(l1+tmp1, s1-tmp1, l2+tmp2, s2-tmp2);
701 mapSingleEndSeqList(l1, tmp1, l2+tmp2, s2-tmp2);
702 }
703 }
704 /**********************************************/
705 int mapSingleEndSeq()
706 {
707 int i = 0;
708 unsigned int *locs = NULL;
709 unsigned int *seqInfo = NULL;
710 while ( i < _msf_rIndexSize )
711 {
712
713 locs = getCandidates (_msf_rIndex[i].hv);
714 if ( locs != NULL)
715 {
716 seqInfo = _msf_rIndex[i].seqInfo;
717 mapSingleEndSeqListTOP (locs+1, locs[0], seqInfo+1, seqInfo[0]);
718 }
719 i++;
720 }
721 return 1;
722 }
723
724
725 /**********************************************/
726 /**********************************************/
727 /**********************************************/
728 /**********************************************/
729 /**********************************************/
730 int compareOut (const void *a, const void *b)
731 {
732 FullMappingInfo *aInfo = (FullMappingInfo *)a;
733 FullMappingInfo *bInfo = (FullMappingInfo *)b;
734 return aInfo->loc - bInfo->loc;
735 }
736
737 /**********************************************/
738 void mapPairedEndSeqList(unsigned int *l1, int s1, unsigned int *l2, int s2)
739 {
740 if ( s2/s1 <= 2)
741 {
742 int j = 0;
743 int z = 0;
744 int *locs = (int *) l1;
745 int *seqInfo = (int *) l2;
746 char *_tmpSeq, *_tmpQual;
747 char rqual[QUAL_LENGTH+1];
748 rqual[QUAL_LENGTH]='\0';
749
750 for (j=0; j<s2; j++)
751 {
752 int re = _msf_samplingLocsSize * 2;
753 int r = seqInfo[j]/re;
754
755 if (pairedEndDiscordantMode && (_msf_seqList[r].hits[0] == 1 || (_msf_seqHits[r/2] > DISCORDANT_CUT_OFF) ))
756 {
757 continue;
758 }
759
760 int x = seqInfo[j] % re;
761 int o = x % _msf_samplingLocsSize;
762 char d = (x/_msf_samplingLocsSize)?-1:1;
763
764
765 if (d==-1)
766 {
767 _tmpSeq = _msf_seqList[r].rseq;
768 }
769 else
770 {
771 _tmpSeq = _msf_seqList[r].seq;
772 }
773
774
775 for (z=0; z<s1; z++)
776 {
777 int genLoc = locs[z]-_msf_samplingLocs[o];
778
779
780 if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd )
781 continue;
782
783 int err = -1;
784
785
786
787 err = verifySingleEnd(genLoc, _tmpSeq, o);
788
789
790 if (err != -1)
791 {
792 MappingLocations *parent = NULL;
793 MappingLocations *child = _msf_mappingInfo[r].next;
794
795 genLoc+= _msf_refGenOffset;
796
797 int i = 0;
798 for (i=0; i<(_msf_mappingInfo[r].size/MAP_CHUNKS); i++)
799 {
800 parent = child;
801 child = child->next;
802 }
803
804 if (child==NULL)
805 {
806 MappingLocations *tmp = getMem(sizeof(MappingLocations));
807 tmp->next = NULL;
808 tmp->loc[0]=genLoc * d;
809 if (parent == NULL)
810 _msf_mappingInfo[r].next = tmp;
811 else
812 parent->next = tmp;
813 }
814 else
815 {
816 child->loc[_msf_mappingInfo[r].size % MAP_CHUNKS] = genLoc * d;
817 }
818
819
820
821 _msf_mappingInfo[r].size++;
822
823 }
824
825 }
826 }
827 }
828 else if (s1 == 1)
829 {
830 int tmp = s2/2;
831 mapPairedEndSeqList(l1, s1, l2, tmp);
832 mapPairedEndSeqList(l1, s1, l2+tmp, s2-tmp);
833 }
834 else if (s2 == 1)
835 {
836 int tmp = s1/2;
837 mapPairedEndSeqList(l1, tmp, l2, s2);
838 mapPairedEndSeqList(l1+tmp, s1-tmp, l2, s2);
839 }
840 else
841 {
842 int tmp1=s1/2, tmp2= s2/2;
843 mapPairedEndSeqList(l1, tmp1, l2, tmp2);
844 mapPairedEndSeqList(l1+tmp1, s1-tmp1, l2, tmp2);
845 mapPairedEndSeqList(l1+tmp1, s1-tmp1, l2+tmp2, s2-tmp2);
846 mapPairedEndSeqList(l1, tmp1, l2+tmp2, s2-tmp2);
847 }
848 }
849
850 /**********************************************/
851 int mapPairedEndSeq()
852 {
853 int i = 0;
854 unsigned int *locs = NULL;
855 unsigned int *seqInfo = NULL;
856 while ( i < _msf_rIndexSize )
857 {
858 locs = getCandidates (_msf_rIndex[i].hv);
859 if ( locs != NULL)
860 {
861 seqInfo = _msf_rIndex[i].seqInfo;
862 mapPairedEndSeqList(locs+1, locs[0], seqInfo+1, seqInfo[0]);
863
864 }
865 i++;
866 }
867
868
869 char fname1[FILE_NAME_LENGTH];
870 char fname2[FILE_NAME_LENGTH];
871 MappingLocations *cur, *tmp;
872 int tmpOut;
873 int j;
874 int lmax=0, rmax=0;
875
876 sprintf(fname1, "%s__%s__%d__1",mappingOutputPath, mappingOutput, _msf_openFiles);
877 sprintf(fname2, "%s__%s__%d__2",mappingOutputPath, mappingOutput, _msf_openFiles);
878
879 FILE* out;
880 FILE* out1 = fileOpen(fname1, "w");
881 FILE* out2 = fileOpen(fname2, "w");
882
883 _msf_openFiles++;
884
885 for (i=0; i<_msf_seqListSize; i++)
886 {
887
888 if (i%2==0)
889 {
890 out = out1;
891
892 if (lmax < _msf_mappingInfo[i].size)
893 {
894 lmax = _msf_mappingInfo[i].size;
895 }
896 }
897 else
898 {
899 out = out2;
900 if (rmax < _msf_mappingInfo[i].size)
901 {
902 rmax = _msf_mappingInfo[i].size;
903 }
904 }
905
906 tmpOut = fwrite(&(_msf_mappingInfo[i].size), sizeof(int), 1, out);
907 if (_msf_mappingInfo[i].size > 0)
908 {
909 cur = _msf_mappingInfo[i].next;
910 for (j=0; j < _msf_mappingInfo[i].size; j++)
911 {
912 if ( j>0 && j%MAP_CHUNKS==0)
913 {
914 cur = cur->next;
915 }
916 tmpOut = fwrite(&(cur->loc[j % MAP_CHUNKS]), sizeof(int), 1, out);
917 }
918 _msf_mappingInfo[i].size = 0;
919 // _msf_mappingInfo[i].next = NULL;
920 }
921 }
922
923 _msf_maxLSize += lmax;
924 _msf_maxRSize += rmax;
925
926 fclose(out1);
927 fclose(out2);
928
929 //fprintf(stdout, "%d %d\n", _msf_maxLSize, _msf_maxRSize);
930
931 }
932
933 /**********************************************/
934 void outputPairedEnd()
935 {
936
937 char *curGen;
938 char *curGenName;
939 int tmpOut;
940
941 loadRefGenome(&_msf_refGen, &_msf_refGenName, &tmpOut);
942
943 FILE* in1[_msf_openFiles];
944 FILE* in2[_msf_openFiles];
945
946 char fname1[_msf_openFiles][FILE_NAME_LENGTH];
947 char fname2[_msf_openFiles][FILE_NAME_LENGTH];
948
949 // discordant
950 FILE *out, *out1, *out2;
951
952 char fname3[FILE_NAME_LENGTH];
953 char fname4[FILE_NAME_LENGTH];
954 char fname5[FILE_NAME_LENGTH];
955
956 if (pairedEndDiscordantMode)
957 {
958 sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput);
959 sprintf(fname4, "%s__%s__oea1", mappingOutputPath, mappingOutput);
960 sprintf(fname5, "%s__%s__oea2", mappingOutputPath, mappingOutput);
961 out = fileOpen(fname3, "a");
962 out1 = fileOpen(fname4, "a");
963 out2 = fileOpen(fname5, "a");
964 }
965
966
967
968 int i;
969
970 FullMappingInfo *mi1 = getMem(sizeof(FullMappingInfo) * _msf_maxLSize);
971 FullMappingInfo *mi2 = getMem(sizeof(FullMappingInfo) * _msf_maxRSize);
972
973
974 for (i=0; i<_msf_openFiles; i++)
975 {
976 sprintf(fname1[i], "%s__%s__%d__1", mappingOutputPath, mappingOutput, i);
977 sprintf(fname2[i], "%s__%s__%d__2", mappingOutputPath, mappingOutput, i);
978 in1[i] = fileOpen(fname1[i], "r");
979 in2[i] = fileOpen(fname2[i], "r");
980 }
981
982
983 int size;
984 int j, k;
985 int size1, size2;
986
987 for (i=0; i<_msf_seqListSize/2; i++)
988 {
989 size1 = size2 = 0;
990 for (j=0; j<_msf_openFiles; j++)
991 {
992 tmpOut = fread(&size, sizeof(int), 1, in1[j]);
993 if ( size > 0 )
994 {
995 for (k=0; k<size; k++)
996 {
997
998 mi1[size1+k].dir = 1;
999 tmpOut = fread (&(mi1[size1+k].loc), sizeof(int), 1, in1[j]);
1000 if (mi1[size1+k].loc<1)
1001 {
1002 mi1[size1+k].loc *= -1;
1003 mi1[size1+k].dir = -1;
1004 }
1005 }
1006 qsort(mi1+size1, size, sizeof(FullMappingInfo), compareOut);
1007 size1+=size;
1008 }
1009 }
1010
1011 for (j=0; j<_msf_openFiles; j++)
1012 {
1013 tmpOut = fread(&size, sizeof(int), 1, in2[j]);
1014 if ( size > 0 )
1015 {
1016 for (k=0; k<size; k++)
1017 {
1018
1019 mi2[size2+k].dir = 1;
1020 tmpOut = fread (&(mi2[size2+k].loc), sizeof(int), 1, in2[j]);
1021
1022 if (mi2[size2+k].loc<1)
1023 {
1024 mi2[size2+k].loc *= -1;
1025 mi2[size2+k].dir = -1;
1026 }
1027 }
1028 qsort(mi2+size2, size, sizeof(FullMappingInfo), compareOut);
1029 size2+=size;
1030 }
1031 }
1032
1033 //if (i == 6615)
1034 // fprintf(stdout, "%d: %s %d %d ",i, _msf_seqList[i*2].name, size1, size2);
1035
1036 int lm, ll, rl, rm;
1037 int pos = 0;
1038
1039 if (pairedEndDiscordantMode)
1040 {
1041
1042 for (j=0; j<size1; j++)
1043 {
1044 lm = mi1[j].loc - maxPairEndedDiscordantDistance + 1;
1045 ll = mi1[j].loc - minPairEndedDiscordantDistance + 1;
1046 rl = mi1[j].loc + minPairEndedDiscordantDistance - 1;
1047 rm = mi1[j].loc + maxPairEndedDiscordantDistance - 1;
1048
1049 while (pos<size2 && mi2[pos].loc < lm)
1050 {
1051 pos++;
1052 }
1053
1054 k = pos;
1055 while (k<size2 && mi2[k].loc<=rm)
1056 {
1057 if ( mi2[k].loc <= ll || mi2[k].loc >= rl)
1058 {
1059 if ( (mi1[j].loc < mi2[k].loc && mi1[j].dir==1 && mi2[k].dir == -1) ||
1060 (mi1[j].loc > mi2[k].loc && mi1[j].dir==-1 && mi2[k].dir == 1) )
1061 {
1062 _msf_seqList[i*2].hits[0]=1;
1063 _msf_seqList[i*2+1].hits[0]=1;
1064 size1=0;
1065 size2=0;
1066 break;
1067 }
1068 }
1069 k++;
1070 }
1071 }
1072
1073 _msf_seqHits[i]+= size1*size2;
1074 if (_msf_seqHits[i]> DISCORDANT_CUT_OFF)
1075 {
1076 _msf_seqList[i*2].hits[0]=1;
1077 _msf_seqList[i*2+1].hits[0]=1;
1078 size1=0;
1079 size2=0;
1080 }
1081 }
1082 //if (i == 6615)
1083 // fprintf(stdout, "%d %d\n", size1, size2);
1084
1085 char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2;
1086 char rqual1[QUAL_LENGTH+1], rqual2[QUAL_LENGTH+1];
1087 rqual1[QUAL_LENGTH] = rqual2[QUAL_LENGTH] = '\0';
1088 seq1 = _msf_seqList[i*2].seq;
1089 rseq1 = _msf_seqList[i*2].rseq;
1090 qual1 = _msf_seqList[i*2].qual;
1091 reverse(_msf_seqList[i*2].qual, rqual1, QUAL_LENGTH);
1092
1093 seq2 = _msf_seqList[i*2+1].seq;
1094 rseq2 = _msf_seqList[i*2+1].rseq;
1095 qual2 = _msf_seqList[i*2+1].qual;
1096 reverse(_msf_seqList[i*2+1].qual, rqual2, QUAL_LENGTH);
1097
1098
1099 if (pairedEndDiscordantMode)
1100 {
1101 for (k=0; k<size1; k++)
1102 {
1103 int tm = -1;
1104 mi1[k].score = calculateScore(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, (mi1[k].dir==-1)?rqual1:qual1, &tm);
1105 mi1[k].err = tm;
1106 }
1107
1108 for (k=0; k<size2; k++)
1109 {
1110 int tm = -1;
1111 mi2[k].score = calculateScore(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, (mi2[k].dir==-1)?rqual2:qual2, &tm);
1112 mi2[k].err = tm;
1113 }
1114
1115 }
1116 else
1117 {
1118 for (k=0; k<size1; k++)
1119 {
1120 mi1[k].err = calculateMD(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, -1, &_msf_op);
1121 sprintf(mi1[k].md, "%s", _msf_op);
1122 }
1123
1124 for (k=0; k<size2; k++)
1125 {
1126 mi2[k].err = calculateMD(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, -1, &_msf_op);
1127 sprintf(mi2[k].md, "%s", _msf_op);
1128 }
1129 }
1130 pos = 0;
1131
1132 for (j=0; j<size1; j++)
1133 {
1134 lm = mi1[j].loc - maxPairEndedDistance + 1;
1135 ll = mi1[j].loc - minPairEndedDistance + 1;
1136 rl = mi1[j].loc + minPairEndedDistance - 1;
1137 rm = mi1[j].loc + maxPairEndedDistance - 1;
1138
1139 //fprintf(stdout, "%d %d %d %d %d\n",lm, ll,mi1[j].loc ,rl, rm);
1140
1141 while (pos<size2 && mi2[pos].loc < lm)
1142 {
1143 pos++;
1144 }
1145
1146 //fprintf(stdout, "POS: %d %d \n", pos, mi2[pos].loc);
1147
1148 k = pos;
1149 while (k<size2 && mi2[k].loc <= rm)
1150 {
1151 if (mi2[k].loc <= ll || mi2[k].loc >= rl)
1152 {
1153 if (pairedEndDiscordantMode)
1154 {
1155 int tmp;
1156 int rNo = i;
1157 int loc = mi1[j].loc*mi1[j].dir;
1158 int err = mi1[j].err;
1159 float sc = mi1[j].score;
1160
1161 char l = strlen(_msf_refGenName);
1162
1163 tmp = fwrite(&rNo, sizeof(int), 1, out);
1164
1165 tmp = fwrite(&l, sizeof(char), 1, out);
1166 tmp = fwrite(_msf_refGenName, sizeof(char), l, out);
1167
1168 tmp = fwrite(&loc, sizeof(int), 1, out);
1169 tmp = fwrite(&err, sizeof(char), 1, out);
1170 tmp = fwrite(&sc, sizeof(float), 1, out);
1171
1172
1173 loc = mi2[k].loc*mi2[k].dir;
1174 err = mi2[k].err;
1175 sc = mi2[k].score;
1176
1177 tmp = fwrite(&loc, sizeof(int), 1, out);
1178 tmp = fwrite(&err, sizeof(char), 1, out);
1179 tmp = fwrite(&sc, sizeof(float), 1, out);
1180 } // end discordant
1181 else
1182 { //start sampe
1183 char *seq;
1184 char *qual;
1185 char d1;
1186 char d2;
1187 int isize;
1188 int proper=0;
1189 // ISIZE CALCULATION
1190 // The distance between outer edges
1191 isize = abs(mi1[j].loc - mi2[k].loc)+SEQ_LENGTH-1;
1192 if (mi1[j].loc - mi2[k].loc > 0)
1193 {
1194 isize *= -1;
1195 }
1196
1197 d1 = (mi1[j].dir == -1)?1:0;
1198 d2 = (mi2[k].dir == -1)?1:0;
1199
1200 if ( d1 )
1201 {
1202 seq = rseq1;
1203 qual = rqual1;
1204 }
1205 else
1206 {
1207 seq = seq1;
1208 qual = qual1;
1209 }
1210
1211 if ( (mi1[j].loc < mi2[k].loc && !d1 && d2) ||
1212 (mi1[j].loc > mi2[k].loc && d1 && !d2) )
1213 {
1214 proper = 2;
1215 }
1216 else
1217 {
1218 proper = 0;
1219 }
1220
1221
1222 _msf_output.POS = mi1[j].loc;
1223 _msf_output.MPOS = mi2[k].loc;
1224 _msf_output.FLAG = 1+proper+16*d1+32*d2+64;
1225 _msf_output.ISIZE = isize;
1226 _msf_output.SEQ = seq,
1227 _msf_output.QUAL = qual;
1228 _msf_output.QNAME = _msf_seqList[i*2].name;
1229 _msf_output.RNAME = _msf_refGenName;
1230 _msf_output.MAPQ = 255;
1231 _msf_output.CIGAR = _msf_cigar;
1232 _msf_output.MRNAME = "=";
1233
1234 _msf_output.optSize = 2;
1235 _msf_output.optFields = _msf_optionalFields;
1236
1237 _msf_optionalFields[0].tag = "NM";
1238 _msf_optionalFields[0].type = 'i';
1239 _msf_optionalFields[0].iVal = mi1[j].err;
1240
1241 _msf_optionalFields[1].tag = "MD";
1242 _msf_optionalFields[1].type = 'Z';
1243 _msf_optionalFields[1].sVal = mi1[j].md;
1244
1245
1246 output(_msf_output);
1247
1248 if ( d2 )
1249 {
1250 seq = rseq2;
1251 qual = rqual2;
1252 }
1253 else
1254 {
1255 seq = seq2;
1256 qual = qual2;
1257 }
1258
1259 _msf_output.POS = mi2[k].loc;
1260 _msf_output.MPOS = mi1[j].loc;
1261 _msf_output.FLAG = 1+proper+16*d2+32*d1+128;
1262 _msf_output.ISIZE = -isize;
1263 _msf_output.SEQ = seq,
1264 _msf_output.QUAL = qual;
1265 _msf_output.QNAME = _msf_seqList[i*2].name;
1266 _msf_output.RNAME = _msf_refGenName;
1267 _msf_output.MAPQ = 255;
1268 _msf_output.CIGAR = _msf_cigar;
1269 _msf_output.MRNAME = "=";
1270
1271 _msf_output.optSize = 2;
1272 _msf_output.optFields = _msf_optionalFields;
1273
1274 _msf_optionalFields[0].tag = "NM";
1275 _msf_optionalFields[0].type = 'i';
1276 _msf_optionalFields[0].iVal = mi2[k].err;;
1277
1278 _msf_optionalFields[1].tag = "MD";
1279 _msf_optionalFields[1].type = 'Z';
1280 _msf_optionalFields[1].sVal = mi2[k].md;
1281
1282 output(_msf_output);
1283 } //end sampe
1284 }
1285 k++;
1286 }
1287 }
1288 }
1289
1290 if (pairedEndDiscordantMode)
1291 {
1292 fclose(out);
1293 }
1294
1295
1296 freeMem(mi1, sizeof(FullMappingInfo)*_msf_maxLSize);
1297 freeMem(mi2, sizeof(FullMappingInfo)*_msf_maxRSize);
1298
1299 for (i=0; i<_msf_openFiles; i++)
1300 {
1301 fclose(in1[i]);
1302 fclose(in2[i]);
1303 //fprintf(stdout, "%s %s \n", fname1[i], fname2[i]);
1304 unlink(fname1[i]);
1305 unlink(fname2[i]);
1306 }
1307 _msf_openFiles = 0;
1308
1309 }
1310
1311 /**********************************************/
1312 /**********************************************/
1313 /**********************************************/
1314 /**********************************************/
1315 float calculateScore(int index, char *seq, char *qual, int *err)
1316 {
1317 int i;
1318 char *ref;
1319 char *ver;
1320
1321 ref = _msf_refGen + index-1;
1322 ver = seq;
1323 float score = 1;
1324
1325 if (*err > 0 || *err == -1)
1326 {
1327 *err = 0;
1328
1329 for (i=0; i < SEQ_LENGTH; i++)
1330 {
1331 if (*ref != *ver)
1332 {
1333 //fprintf(stdout, "%c %c %d", *ref, *ver, *err);
1334 (*err)++;
1335 score *= 0.001 + 1/pow( 10, ((qual[i]-33)/10.0) );
1336 }
1337 ref++;
1338 ver++;
1339 }
1340
1341 }
1342 return score;
1343 }
1344
1345 /**********************************************/
1346 void outputPairedEndDiscPP()
1347 {
1348 char genName[SEQ_LENGTH];
1349 char fname1[FILE_NAME_LENGTH];
1350 char fname2[FILE_NAME_LENGTH];
1351 char fname3[FILE_NAME_LENGTH];
1352 char fname4[FILE_NAME_LENGTH];
1353 char fname5[FILE_NAME_LENGTH];
1354 char fname6[FILE_NAME_LENGTH];
1355 char l;
1356 int loc1, loc2;
1357 char err1, err2;
1358 char dir1, dir2;
1359 float sc1, sc2, lsc=0;
1360 int flag = 0;
1361 int rNo,lrNo = -1;
1362 int tmp;
1363 FILE *in, *in1, *in2, *out, *out1, *out2;
1364
1365 sprintf(fname1, "%s__%s__disc", mappingOutputPath, mappingOutput);
1366 sprintf(fname2, "%s__%s__oea1", mappingOutputPath, mappingOutput);
1367 sprintf(fname3, "%s__%s__oea2", mappingOutputPath, mappingOutput);
1368 sprintf(fname4, "%s%s_DIVET.vh", mappingOutputPath, mappingOutput);
1369 sprintf(fname5, "%s%s_OEA1.vh", mappingOutputPath, mappingOutput);
1370 sprintf(fname6, "%s%s_OEA2.vh", mappingOutputPath, mappingOutput);
1371
1372 in = fileOpen(fname1, "r");
1373 in1 = fileOpen(fname2, "r");
1374 in2 = fileOpen(fname3, "r");
1375 out = fileOpen(fname4, "w");
1376 out1 = fileOpen(fname5, "w");
1377 out2 = fileOpen(fname6, "w");
1378 if (in != NULL)
1379 {
1380 flag = fread(&rNo, sizeof(int), 1, in);
1381 }
1382 else
1383 {
1384 flag = 0;
1385 }
1386
1387
1388 while (flag)
1389 {
1390
1391 tmp = fread(&l, sizeof(char), 1, in);
1392 tmp = fread(genName, sizeof(char), l, in);
1393 genName[l]='\0';
1394 tmp = fread(&loc1, sizeof(int), 1, in);
1395 tmp = fread(&err1, sizeof(char), 1, in);
1396 tmp = fread(&sc1, sizeof(float), 1, in);
1397 tmp = fread(&loc2, sizeof(int), 1, in);
1398 tmp = fread(&err2, sizeof(char), 1, in);
1399 tmp = fread(&sc2, sizeof(float), 1, in);
1400
1401 //if (rNo ==6615)
1402 // fprintf(stdout, "%s %d: %d %0.20f %d %d %0.20f\n", genName, loc1, err1, sc1, loc2, err2, sc2);
1403
1404 if (_msf_seqList[rNo*2].hits[0] % 2 == 0 && _msf_seqHits[rNo] < DISCORDANT_CUT_OFF)
1405 {
1406 dir1 = dir2 = 'F';
1407
1408 if (loc1 < 0)
1409 {
1410 dir1 = 'R';
1411 loc1 = -loc1;
1412 }
1413
1414 if (loc2 < 0)
1415 {
1416 dir2 = 'R';
1417 loc2 = -loc2;
1418 }
1419
1420 if (rNo != lrNo)
1421 {
1422 int j;
1423 for (j=0; j<SEQ_LENGTH; j++)
1424 {
1425 lsc += _msf_seqList[rNo*2].qual[j]+_msf_seqList[rNo*2+1].qual[j];
1426 }
1427 lsc /= 2*SEQ_LENGTH;
1428 lsc -= 33;
1429 lrNo = rNo;
1430 }
1431
1432 int inv = 0;
1433 int eve = 0;
1434 int dist = 0;
1435 char event;
1436
1437 //fprintf(stdout, "%c %c ", dir1, dir2);
1438
1439 if ( dir1 == dir2 )
1440 {
1441 event = 'V';
1442 //fprintf(stdout, "Inverstion \n");
1443 }
1444 else
1445 {
1446 if (loc1 < loc2)
1447 {
1448
1449 //fprintf(stdout, "< %d ", loc2-loc1-SEQ_LENGTH);
1450
1451 if (dir1 == 'R' && dir2 == 'F')
1452 {
1453 event = 'E';
1454
1455 //fprintf(stdout, "Everted \n");
1456 }
1457 else if ( loc2 - loc1 >= maxPairEndedDiscordantDistance )
1458 {
1459 event = 'D';
1460 //fprintf(stdout, "Deletion \n");
1461 }
1462 else
1463 {
1464 event = 'I';
1465 //fprintf(stdout, "Insertion \n");
1466 }
1467 }
1468 else if (loc2 < loc1)
1469 {
1470 //fprintf(stdout, "> %d ", loc1-loc2-SEQ_LENGTH);
1471 if (dir2 == 'R' && dir1 == 'F')
1472 {
1473 event = 'E';
1474 //fprintf(stdout, "Everted \n");
1475 }
1476 else if ( loc1 - loc2 >= maxPairEndedDiscordantDistance )
1477 {
1478 event = 'D';
1479 //fprintf(stdout, "Deletion \n");
1480 }
1481 else
1482 {
1483 event = 'I';
1484 //fprintf(stdout, "Insertion \n");
1485 }
1486 }
1487 }
1488 _msf_seqList[rNo*2].hits[0] = 2;
1489 fprintf(out, "%s\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%0.20f\n",
1490 _msf_seqList[rNo*2].name, genName, loc1, (loc1+SEQ_LENGTH-1), dir1, genName, loc2, (loc2+SEQ_LENGTH-1), dir2, event, (err1+err2), lsc, sc1*sc2);
1491 }
1492 flag = fread(&rNo, sizeof(int), 1, in);
1493
1494 }
1495
1496 /*
1497 MappingInfoNode *lr[_msf_seqListSize/2];
1498 MappingInfoNode *rr[_msf_seqListSize/2];
1499 MappingInfoNode *cur, *tmpDel, *cur2;
1500
1501
1502 int ls[_msf_seqListSize/2];
1503 int rs[_msf_seqListSize/2];
1504
1505
1506 int i=0;
1507
1508 for (i = 0; i<_msf_seqListSize/2; i++)
1509 {
1510 lr[i] = rr[i] = NULL;
1511 ls[i] = rs[i] = 0;
1512 }
1513
1514
1515
1516 if (in1 != NULL)
1517 {
1518 flag = fread(&rNo, sizeof(int), 1, in1);
1519 }
1520 else
1521 {
1522 flag = 0;
1523 }
1524
1525
1526 while (flag)
1527 {
1528 tmp = fread(&loc1, sizeof(int), 1, in1);
1529 tmp = fread(&err1, sizeof(char), 1, in1);
1530 tmp = fread(&sc1, sizeof(float), 1, in1);
1531 tmp = fread(&l, sizeof(char), 1, in1);
1532 tmp = fread(genName, sizeof(char), l, in1);
1533 genName[l]='\0';
1534
1535 if (_msf_seqList[rNo*2].hits[0] == 0)
1536 {
1537
1538 if ( ls[rNo] < DISCORDANT_CUT_OFF )
1539 {
1540 ls[rNo]++;
1541
1542 cur = lr[rNo];
1543
1544 if (cur !=NULL)
1545 {
1546 if (err1 == cur->err)
1547 {
1548 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode));
1549
1550 nr->loc = loc1;
1551 nr->err = err1;
1552 nr->score = sc1;
1553 nr->next = lr[rNo];
1554 sprintf(nr->chr,"%s", genName);
1555 lr[rNo] = nr;
1556 }
1557 else if (err1 < cur->err)
1558 {
1559 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode));
1560
1561 nr->loc = loc1;
1562 nr->err = err1;
1563 nr->score = sc1;
1564 sprintf(nr->chr,"%s", genName);
1565 nr->next = NULL;
1566 lr[rNo] = nr;
1567 while (cur!=NULL)
1568 {
1569 tmpDel = cur;
1570 cur = cur->next;
1571 freeMem(tmpDel, sizeof(MappingInfoNode));
1572 }
1573 }
1574 }
1575 else
1576 {
1577
1578 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode));
1579
1580 nr->loc = loc1;
1581 nr->err = err1;
1582 nr->score = sc1;
1583 sprintf(nr->chr,"%s", genName);
1584 nr->next = NULL;
1585 lr[rNo] = nr;
1586 }
1587
1588 if (ls[rNo] > DISCORDANT_CUT_OFF)
1589 {
1590 cur = lr[rNo];
1591 while (cur!=NULL)
1592 {
1593 tmpDel = cur;
1594 cur = cur->next;
1595 freeMem(tmpDel, sizeof(MappingInfoNode));
1596 }
1597 }
1598 }
1599
1600 }
1601 flag = fread(&rNo, sizeof(int), 1, in1);
1602
1603 }
1604
1605
1606 if (in2 != NULL)
1607 {
1608 flag = fread(&rNo, sizeof(int), 1, in2);
1609 }
1610 else
1611 {
1612 flag = 0;
1613 }
1614
1615
1616 while (flag)
1617 {
1618 tmp = fread(&loc1, sizeof(int), 1, in2);
1619 tmp = fread(&err1, sizeof(char), 1, in2);
1620 tmp = fread(&sc1, sizeof(float), 1, in2);
1621 tmp = fread(&l, sizeof(char), 1, in2);
1622 tmp = fread(genName, sizeof(char), l, in2);
1623 genName[l]='\0';
1624
1625 if (_msf_seqList[rNo*2].hits[0] == 0)
1626 {
1627
1628 if ( rs[rNo] < DISCORDANT_CUT_OFF )
1629 {
1630 rs[rNo]++;
1631
1632 cur = rr[rNo];
1633
1634 if (cur !=NULL)
1635 {
1636 if (err1 == cur->err)
1637 {
1638 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode));
1639
1640 nr->loc = loc1;
1641 nr->err = err1;
1642 nr->score = sc1;
1643 nr->next = rr[rNo];
1644 sprintf(nr->chr,"%s", genName);
1645 rr[rNo] = nr;
1646 }
1647 else if (err1 < cur->err)
1648 {
1649 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode));
1650
1651 nr->loc = loc1;
1652 nr->err = err1;
1653 nr->score = sc1;
1654 sprintf(nr->chr,"%s", genName);
1655 nr->next = NULL;
1656 rr[rNo] = nr;
1657 while (cur!=NULL)
1658 {
1659 tmpDel = cur;
1660 cur = cur->next;
1661 freeMem(tmpDel, sizeof(MappingInfoNode));
1662 }
1663 }
1664 }
1665 else
1666 {
1667
1668 MappingInfoNode *nr = getMem(sizeof(MappingInfoNode));
1669
1670 nr->loc = loc1;
1671 nr->err = err1;
1672 nr->score = sc1;
1673 sprintf(nr->chr,"%s", genName);
1674 nr->next = NULL;
1675 rr[rNo] = nr;
1676 }
1677
1678 if (rs[rNo] > DISCORDANT_CUT_OFF)
1679 {
1680 cur = rr[rNo];
1681 while (cur!=NULL)
1682 {
1683 tmpDel = cur;
1684 cur = cur->next;
1685 freeMem(tmpDel, sizeof(MappingInfoNode));
1686 }
1687 }
1688 }
1689 }
1690 flag = fread(&rNo, sizeof(int), 1, in2);
1691
1692 }
1693
1694
1695 for (i=0; i<_msf_seqListSize/2; i++)
1696 {
1697 int j;
1698 for (j=0; j<SEQ_LENGTH; j++)
1699 {
1700 lsc += _msf_seqList[i*2].qual[j]+_msf_seqList[i*2+1].qual[j];
1701 }
1702 lsc /= 2*SEQ_LENGTH;
1703 lsc -= 33;
1704 if (ls[i] * rs[i] < DISCORDANT_CUT_OFF && ls[i] & rs[i] > 0)
1705 {
1706 cur = lr[i];
1707 while (cur != NULL)
1708 {
1709 cur2 = rr[i];
1710 if (cur->loc < 0)
1711 {
1712 dir1 = 'R';
1713 loc1 = -cur->loc;
1714 }
1715 else
1716 {
1717 dir1 = 'F';
1718 loc1 = cur->loc;
1719 }
1720 while (cur2 != NULL)
1721 {
1722
1723 if (cur2->loc < 0)
1724 {
1725 dir2 = 'R';
1726 loc2 = -cur2->loc;
1727 }
1728 else
1729 {
1730 dir2 = 'F';
1731 loc2 = cur2->loc;
1732 }
1733
1734 fprintf(out, "%s\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%0.20f\n",
1735 _msf_seqList[i*2].name, cur->chr, loc1, (loc1+SEQ_LENGTH-1), dir1, cur2->chr, loc2, (loc2+SEQ_LENGTH-1), dir2, 'T', (cur->err+cur2->err), lsc, cur->score*cur2->score);
1736 cur2 = cur2->next;
1737 }
1738 cur = cur->next;
1739 }
1740 }
1741
1742 }*/
1743
1744
1745 fclose(in);
1746 fclose(in1);
1747 fclose(in2);
1748 fclose(out);
1749 fclose(out1);
1750 fclose(out2);
1751
1752 unlink(fname1);
1753 unlink(fname2);
1754 unlink(fname3);
1755 unlink(fname5);
1756 unlink(fname6);
1757 }