comparison mrfast-2.1.0.4/MrFAST.c @ 0:7b3dc85dc7fd

Uploaded mrfast source tarball
author calkan
date Tue, 21 Feb 2012 10:29:47 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7b3dc85dc7fd
1 /*
2 * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without modification,
6 * are permitted provided that the following conditions are met:
7 *
8 * Redistributions of source code must retain the above copyright notice, this list
9 * of conditions and the following disclaimer.
10 * - Redistributions in binary form must reproduce the above copyright notice, this
11 * list of conditions and the following disclaimer in the documentation and/or other
12 * materials provided with the distribution.
13 * - Neither the names of the University of Washington, Simon Fraser University,
14 * nor the names of its contributors may be
15 * used to endorse or promote products derived from this software without specific
16 * prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
22 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 Authors:
33 Farhad Hormozdiari
34 Faraz Hach
35 Can Alkan
36 Emails:
37 farhadh AT uw DOT edu
38 fhach AT cs DOT sfu DOT ca
39 calkan AT uw DOT edu
40 */
41
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <math.h>
46 #include <dirent.h>
47 #include <xmmintrin.h>
48 #include <emmintrin.h>
49 #include <mmintrin.h>
50
51
52 #include "Common.h"
53 #include "Reads.h"
54 #include "HashTable.h"
55 #include "Output.h"
56 #include "MrFAST.h"
57 #include "RefGenome.h"
58
59
/*
 * Small helper macros.
 *
 * Every parameter is parenthesized at each use so that an argument which is
 * itself an expression (e.g. `x | y`, `c ? p : q`) keeps its own grouping.
 * The previous min3/CHARCODE used bare parameters and silently mis-parsed
 * such arguments.
 *
 * NOTE: arguments are still evaluated more than once; do not pass
 * expressions with side effects (i++, function calls that mutate state).
 */

/* Smaller of two values. */
#define min(a,b) ((a)>(b)?(b):(a))

/* Smallest of three values. */
#define min3(a,b,c) ((a)>(b)?((b)>(c)?(c):(b)):((a)>(c)?(c):(a)))

/* Maps 'A','C','G','T' to 0..3 and any other character to 4. */
#define CHARCODE(a) ((a)=='A' ? 0 : ((a)=='C' ? 1 : ((a)=='G' ? 2 : ((a)=='T' ? 3 : 4))))

#define MAX_REF_SIZE 18
65
66
/*
 * File-scope state of MrFAST.c. Most of these objects are only declared here;
 * the code that fills and reads them is outside this part of the file, so the
 * notes below are limited to what the declarations and the functions visible
 * nearby establish.
 */
/* Prototype only; the definition is not in this part of the file. */
67 float calculateScore(int index, char *seq, char *qual, char *md);
68 unsigned char mrFAST = 1;
69 char *versionNumberF="0.4";
70
/* 64-bit counters, all starting at zero. NOTE(review): the names suggest
 * run statistics (verifications, mappings, mapped/completed reads) -- confirm
 * against the code that increments them. */
71 long long verificationCnt = 0;
72 long long mappingCnt = 0;
73 long long mappedSeqCnt = 0;
74 long long completedSeqCnt = 0;
75 char *mappingOutput;
76 /**********************************************/
77 char *_msf_refGen = NULL;
78 int _msf_refGenLength = 0;
79 int _msf_refGenOffset = 0;
80 char *_msf_refGenName = NULL;
81
82 int _msf_refGenBeg;
83 int _msf_refGenEnd;
84
85 IHashTable *_msf_hashTable = NULL;
86
87 int *_msf_samplingLocs;
88 int *_msf_samplingLocsEnds;
89 int _msf_samplingLocsSize;
90
91 Read *_msf_seqList;
92 int _msf_seqListSize;
93
94 Pair *_msf_sort_seqList = NULL;
95 int *_msf_map_sort_seqList;
96
97 ReadIndexTable *_msf_rIndex = NULL;
98 int _msf_rIndexSize;
99 int _msf_rIndexMax;
100
101 SAM _msf_output;
102
103 OPT_FIELDS *_msf_optionalFields;
104
105 char *_msf_op;
106
107 int *_msf_verifiedLocs = NULL;
108
109 char _msf_numbers[200][3];
110 char _msf_cigar[5];
111
112 MappingInfo *_msf_mappingInfo;
113
114 int *_msf_seqHits;
115 int _msf_openFiles = 0;
116 int _msf_maxLSize=0;
117 int _msf_maxRSize=0;
118
119 BestFullMappingInfo *bestHitMappingInfo;
120
121 /*************************/
122 int _msf_maxFile=0;
/* Statically sized: 4000 * 200 * 2 names of FILE_NAME_LENGTH bytes each
 * (FILE_NAME_LENGTH is defined outside this file section). */
123 char _msf_fileName[4000][200][2][FILE_NAME_LENGTH];
124 int _msf_fileCount[4000];
125
126 char *_msf_readHasConcordantMapping; //boolean if a read has concordant mapping :D
127
128 int *_msf_oeaMapping;
129 int *_msf_discordantMapping;
130
131 FILE *bestConcordantFILE;
132 FILE *bestDiscordantFILE;
133
134 int counter = 0;
135
/* initLookUpTable() writes 0..errThreshold into row 0 and column 0 of scoreF
 * and scoreB; the remaining uses are outside this part of the file. */
136 int scoreF[200][200];
137 int scoreB[200][200];
138
139 int score[200][200];
140 int direction1[200][200];
141 int direction2[200][200];
142
/* Set by initLookUpTable(): 16-bit lanes 0..4 hold 1, lanes 5..7 hold 0.
 * The SSE2 routines combine it with _mm_andnot_si128 on a compare result. */
143 __m128i MASK;
144
/* 15625 == 5^6 entries per dimension. With a 4-byte int this single array
 * needs about 931 MiB of static storage.
 * NOTE(review): presumably indexed by two base-5 encoded 6-mers like the ones
 * fastEditDistance() decodes -- confirm against the code that fills it. */
145 int lookUpTable[15625][15625];
146
147 /**************************************************Methods***************************************************/
148 int smallEditDistanceF(char *a, int lena, char *b, int lenb)
149 {
150 int matrix[20][20];
151 int i = 0;
152 int j = 0;
153
154 for(i = 0; i <= lena; i++)
155 {
156 matrix[0][i] = i;
157 }
158
159 for(i = 0; i <= lenb; i++)
160 {
161 matrix[i][0] = i;
162 }
163
164
165 for(i = 1; i <= lenb; i++)
166 {
167 for(j = 1; j <= lena; j++)
168 {
169 matrix[i][j] = min3(matrix[i-1][j-1]+ (a[j-1] != b[i-1]),matrix[i][j-1]+1 ,matrix[i-1][j]+1);
170 }
171 }
172 return (matrix[lenb][lena]>errThreshold?-1:matrix[lenb][lena]);
173 }
174
175 int smallEditDistanceB(char *a, int lena, char *b, int lenb)
176 {
177 int matrix[20][20];
178 int i = 0;
179 int j = 0;
180
181 for(i = 0; i <= lena; i++)
182 {
183 matrix[0][i] = i;
184 }
185
186 for(i = 0; i <= lenb; i++)
187 {
188 matrix[i][0] = i;
189 }
190
191
192 for(i = 1; i <= lenb; i++)
193 {
194 for(j = 1; j <= lena; j++)
195 {
196 matrix[i][j] = min3(matrix[i-1][j-1]+ (*(a-j+1) != *(b-i+1)),matrix[i][j-1]+1 ,matrix[i-1][j]+1);
197 }
198 }
199
200 return (matrix[lenb][lena]>errThreshold?-1:matrix[lenb][lena]);
201 }
202
/*
 * Treats per1 and per2 as six base-5 digits each (most significant digit
 * first; digit 0..3 -> 'A','C','G','T', any other remainder -> 'N'), builds
 * the 7x7 edit-distance table between the two decoded 6-character strings
 * and returns the smallest entry found in the table's last column and last
 * row.
 */
char fastEditDistance(int per1, int per2)
{
    char first[7];
    char second[7];
    char *decoded[2];
    int encoded[2];
    int table[7][7];
    int which;
    int pos;
    int row;
    int col;
    int best = 20;

    encoded[0] = per1;
    encoded[1] = per2;
    decoded[0] = first;
    decoded[1] = second;

    /* Decode both values, filling each string from its last character. */
    for (which = 0; which < 2; which++)
    {
        int rest = encoded[which];

        for (pos = 5; pos >= 0; pos--)
        {
            char letter;

            switch (rest % 5)
            {
                case 0:  letter = 'A'; break;
                case 1:  letter = 'C'; break;
                case 2:  letter = 'G'; break;
                case 3:  letter = 'T'; break;
                default: letter = 'N'; break;
            }
            decoded[which][pos] = letter;
            rest = rest / 5;
        }
        decoded[which][6] = '\0';
    }

    /* Borders of the table: distance to/from the empty prefix. */
    for (pos = 0; pos < 7; pos++)
    {
        table[0][pos] = pos;
        table[pos][0] = pos;
    }

    for (row = 1; row < 7; row++)
    {
        for (col = 1; col < 7; col++)
        {
            int cell = table[row-1][col-1] + (first[row-1] != second[col-1]);
            int fromLeft = table[row][col-1] + 1;
            int fromUp = table[row-1][col] + 1;

            if (fromLeft < cell)
            {
                cell = fromLeft;
            }
            if (fromUp < cell)
            {
                cell = fromUp;
            }
            table[row][col] = cell;
        }
    }

    /* Minimum over the last column, then over the last row. */
    for (row = 0; row < 7; row++)
    {
        if (table[row][6] < best)
        {
            best = table[row][6];
        }
    }
    for (col = 0; col < 7; col++)
    {
        if (table[6][col] < best)
        {
            best = table[6][col];
        }
    }

    return (char)best;
}
269
270 void initLookUpTable()
271 {
272 int i = 0;
273
274 MASK = _mm_insert_epi16(MASK,1,0);
275 MASK = _mm_insert_epi16(MASK,1,1);
276 MASK = _mm_insert_epi16(MASK,1,2);
277 MASK = _mm_insert_epi16(MASK,1,3);
278 MASK = _mm_insert_epi16(MASK,1,4);
279 MASK = _mm_insert_epi16(MASK,0,5);
280 MASK = _mm_insert_epi16(MASK,0,6);
281 MASK = _mm_insert_epi16(MASK,0,7);
282
283 for(i = 0 ; i < errThreshold + 1; i++)
284 {
285 scoreF[0][i] = i;
286 scoreF[i][0] = i;
287 }
288
289 for(i = 0 ; i < errThreshold + 1; i++)
290 {
291 scoreB[0][i] = i;
292 scoreB[i][0] = i;
293 }
294
295
296 }
297
/*
 * SSE2 edit-distance check between two sequences that are read backward from
 * the supplied pointers (characters are fetched as *(a - k) and *(b - k)).
 *
 * Returns:
 *   0            when lena or lenb is 0;
 *   the result of smallEditDistanceB() when lenb <= errThreshold;
 *   -1           as soon as an even step leaves none of the inspected lanes
 *                of R0 at or below errThreshold, or when the final minimum
 *                is larger than errThreshold;
 *   otherwise the smallest error value collected in minError.
 *
 * R0 and R1 hold 16-bit cost lanes and are updated alternately (R0 on even
 * i, R1 on odd i) from per-step mismatch flags (Diag) plus constant penalty
 * vectors (Side1/Side2/Down1/Down2).
 *
 * NOTE(review): `+` is applied directly to __m128i values. That is a compiler
 * vector extension rather than an SSE2 intrinsic; confirm the lane width it
 * uses on the target compiler (_mm_add_epi16 would make it explicit).
 * NOTE(review): the tail after the main loop uses fixed offsets (lenb-3,
 * lena+2, ...) and fixed lane numbers, which suggests it was written for one
 * particular errThreshold -- confirm with the caller.
 */
298 int backwardEditDistanceSSE2Odd(char *a, int lena, char *b,int lenb)
299 {
300 if(lenb == 0 || lena == 0)
301 return 0;
302
303 int i = 0;
304 int j = 0;
305 int k = 0;
306
307
308 int e = errThreshold;
309
310 char flag = 0;
311
/* 2*e serves as the "too expensive" value throughout this function. */
312 int minError = 2*e;
313
314 __m128i R0, R1;
315 __m128i Diag;
316 __m128i Side1, Side2;
317 __m128i Down1, Down2;
/* Error is filled in below but never read inside this function. */
318 __m128i Error;
319 __m128i tmp;
320
321 /* initialize */
322 R0 = _mm_setzero_si128 ();
323 R1 = _mm_setzero_si128 ();
324 Diag = _mm_setzero_si128 ();
325 Side1 = _mm_setzero_si128 ();
326 Side2 = _mm_setzero_si128 ();
327 Down1 = _mm_setzero_si128 ();
328 Down2 = _mm_setzero_si128 ();
329 Error = _mm_setzero_si128 ();
330 tmp = _mm_setzero_si128 ();
331 /* end initialize */
332
/* Short second sequence: use the scalar routine instead. */
333 if(lenb <= e)
334 {
335 return smallEditDistanceB(a,lena,b,lenb);
336 }
337
338
339 R1 = _mm_xor_si128(R1, R1);
340 R0 = _mm_xor_si128(R0, R0);
341
342 Diag = _mm_xor_si128(Diag, Diag);
343 Side1 = _mm_xor_si128(Side1, Side1);
344 Down1 = _mm_xor_si128(Down1, Down1);
345
346 Diag = _mm_insert_epi16(Diag,2*e,0);
347
348 Side1 = _mm_insert_epi16(Side1,1,0);
349 Side1 = _mm_insert_epi16(Side1,2*e,1);
350
351 Down1 = _mm_insert_epi16(Down1,2*e,0);
352 Down1 = _mm_insert_epi16(Down1,1,1);
353 Down1 = _mm_insert_epi16(Down1,2*e,2);
354
355 R0 = _mm_insert_epi16(R0,0,0);
356
357 R1 = _mm_insert_epi16(R1,1,0);
358 R1 = _mm_insert_epi16(R1,1,1);
359
/* Warm-up steps i = 2..e: the penalty vectors and Diag gain one lane per
 * step, and Diag is framed by 2*e in its first and last used lane. */
360 for(i=2; i <= e; i++)
361 {
362 //set side
363 Side1 = _mm_slli_si128(Side1,2);
364 Side1 = _mm_insert_epi16(Side1,1,0);
365
366 Down1 = _mm_insert_epi16(Down1,1,0);
367 Down1 = _mm_slli_si128(Down1,2);
368 Down1 = _mm_insert_epi16(Down1,2*e,0);
369
370 Diag = _mm_xor_si128(Diag, Diag);
371 if( i%2 == 0)
372 {
373 Diag = _mm_insert_epi16(Diag,2*e,0);
374
375 for(j=1;j<=i-1;j++)
376 {
377 Diag = _mm_slli_si128(Diag, 2);
378 Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+(i/2-j))) != *(a-(i/2-1-(i/2-j))),0);
379 }
380 Diag = _mm_slli_si128(Diag, 2);
381 Diag = _mm_insert_epi16(Diag, 2*e,0);
382
383 R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag);
384 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1);
385 }
386
387 else
388 {
389 Diag = _mm_insert_epi16(Diag,2*e,0);
390 for(j=i/2-1;j>=-i/2;j--)
391 {
392 Diag = _mm_slli_si128(Diag, 2);
393 Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i-1)/2-j-1)),0);
394 }
395 Diag = _mm_slli_si128(Diag, 2);
396 Diag = _mm_insert_epi16(Diag, 2*e,0);
397
398 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
399 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
400 }
401 }
/* Rebuild the constant vectors for the main loop. */
402 Error = _mm_xor_si128(Error, Error);
403 Side2 = _mm_xor_si128(Side2, Side2);
404 Down2 = _mm_xor_si128(Down2, Down2);
405 Down1 = _mm_xor_si128(Down1, Down1);
406
407 Error = _mm_insert_epi16(Error,e,0);
/* Side1 is rebuilt from the just-cleared Side2, so it starts out as
 * lane 0 = 2*e with every other lane zero (same as Side2 on the next line). */
408 Side1 = _mm_insert_epi16(Side2,2*e,0);
409 Side2 = _mm_insert_epi16(Side2,2*e,0);
410 Down1 = _mm_insert_epi16(Down1,2*e,0);
411
412
413 for(j=0; j < e; j++)
414 {
415 Side2 = _mm_slli_si128(Side2, 2);
416 Side2 = _mm_insert_epi16(Side2,1,0);
417
418 Side1 = _mm_slli_si128(Side1, 2);
419 Side1 = _mm_insert_epi16(Side1,1,0);
420
421 Down1 = _mm_slli_si128(Down1, 2);
422 Down1 = _mm_insert_epi16(Down1,1,0);
423
424 Down2 = _mm_slli_si128(Down2, 2);
425 Down2 = _mm_insert_epi16(Down2,1,0);
426
427 Error = _mm_slli_si128(Error, 2);
428 Error = _mm_insert_epi16(Error, e, 0);
429 }
430
431 Down2= _mm_slli_si128(Down2, 2);
432 Down2 = _mm_insert_epi16(Down2,2*e,0);
433
/* Main loop: continues from the i left by the warm-up loop (e + 1). */
434 for(; i <= 2*lenb-(e-1);i++)
435 {
436 flag = 0;
437 Diag = _mm_xor_si128(Diag, Diag);
438 if( i%2 == 0)
439 {
440 for(j=e/2;j>=-e/2;j--)
441 {
442 Diag = _mm_slli_si128(Diag, 2);
443 Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+j)) != *(a-(i/2-1-j)),0);
444 }
445
446 R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag);
447 R0 = _mm_min_epi16(R0, R1+Down1);
448
449
/* Early exit: give up when lane 0 and the following e+1 lanes of R0 are
 * all above e. */
450 if(_mm_extract_epi16(R0,0) <= e)
451 flag = 1;
452 tmp = _mm_srli_si128(R0,2);
453 for(j=0; j <= e;j++)
454 {
455 if(_mm_extract_epi16(tmp,0) <= e)
456 flag = 1;
457 tmp = _mm_srli_si128(tmp,2);
458 }
459
460 if(flag == 0)
461 return -1;
462
/* Shifts R0 right by e-1 lanes in total and takes that lane as the first
 * candidate for the result. */
463 if(i == 2*lenb-e)
464 {
465 tmp = _mm_srli_si128(R0,2);
466 for(k=0; k < e-2;k++)
467 tmp = _mm_srli_si128(tmp,2);
468 minError = _mm_extract_epi16(tmp,0);
469 }
470
471 }
472
473 else
474 {
475 for(j=e/2;j>=-e/2-1;j--)
476 {
477 Diag = _mm_slli_si128(Diag, 2);
478 Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i)/2-j-1)),0);
479 }
480
481 // printf("@%d %d %d %d\n", _mm_extract_epi16(Diag,0), _mm_extract_epi16(Diag,1), _mm_extract_epi16(Diag,2),
482 // _mm_extract_epi16(Diag,3));
483
484 R1 = _mm_min_epi16(R0+Side2, R1+Diag);
485
486 // printf("#~%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2),
487 // _mm_extract_epi16(R1,3));
488
489 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2);
490
491 // printf("$%d %d %d %d\n", _mm_extract_epi16(Side2,0), _mm_extract_epi16(Side2,1), _mm_extract_epi16(Side2,2),
492 // _mm_extract_epi16(Side2,3));
493
494 // printf("#%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2),
495 // _mm_extract_epi16(R1,3));
496
497
498
/* Near the end of b: fold lane e of R1 into the running minimum. */
499 if(i >= 2*lenb-e)
500 {
501 tmp = _mm_srli_si128(R1,2);
502 for(k=0; k < e-1;k++)
503 tmp = _mm_srli_si128(tmp,2);
504 minError = min(minError, _mm_extract_epi16(tmp,0));
505 }
506 }
507 }
508
/* Tail: five hand-unrolled steps with hard-coded offsets and lanes. Each one
 * recomputes R1 or R0 and folds one lane into minError. */
509 //first cell
510 Diag = _mm_xor_si128(Diag,Diag);
511 Diag = _mm_insert_epi16(Diag, *(b-(lenb-3)) != *(a-lena), 0);
512 Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena-1)), 1);
513 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena-2)), 2);
514 Diag = _mm_insert_epi16(Diag, 2*e, 3);
515 R1 = _mm_min_epi16(R0+Side2, R1+Diag);
516 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2);
517
518 minError = min(minError, _mm_extract_epi16(R1,2));
519
520 //second cell
521 Diag = _mm_xor_si128(Diag,Diag);
522 Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena)), 0);
523 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena-1)), 1);
524 Diag = _mm_insert_epi16(Diag, 2*e, 2);
525
526 R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag);
527 R0 = _mm_min_epi16(R0, R1+Down1);
528
529 minError = min(minError, _mm_extract_epi16(R0,1));
530
531 //third cell
532 Diag = _mm_xor_si128(Diag,Diag);
533 Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena+1)), 0);
534 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena)), 1);
535 Diag = _mm_insert_epi16(Diag, 2*e, 2);
536
537 R1 = _mm_min_epi16(R0+Side2, R1+Diag);
538 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2);
539
540 minError = min(minError, _mm_extract_epi16(R1,1));
541
542 //fourth
543 Diag = _mm_xor_si128(Diag,Diag);
544 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena+1)), 0);
545 Diag = _mm_insert_epi16(Diag, 2*e, 1);
546
547 R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag);
548 R0 = _mm_min_epi16(R0, R1+Down1);
549
550 minError = min(minError, _mm_extract_epi16(R0,0));
551
552 //fifth
553 Diag = _mm_xor_si128(Diag,Diag);
554 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena+2)), 0);
555 Diag = _mm_insert_epi16(Diag, 2*e, 1);
556
557 R1 = _mm_min_epi16(R0+Side2, R1+Diag);
558 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2);
559
/* NOTE(review): this reads R0 lane 0 again (already folded in by the fourth
 * step); the R1 computed just above is never read. Possibly R1 was intended
 * -- confirm before changing. */
560 minError = min(minError, _mm_extract_epi16(R0,0));
561
562 if(minError > e)
563 return -1;
564 return minError;
565 }
566
/*
 * SSE2 edit-distance check between two sequences that are read backward from
 * the supplied pointers (characters are fetched as *(a - k) and *(b - k)).
 * The band width follows errThreshold (e).
 *
 * Returns:
 *   0            when lena or lenb is 0;
 *   the result of smallEditDistanceB() when lenb <= errThreshold;
 *   -1           as soon as an even step of the main loop leaves none of the
 *                inspected lanes of R0 at or below e, or when the final
 *                minimum is larger than e;
 *   otherwise the smallest error value collected in minError.
 *
 * Apart from the zero check and the smallEditDistanceB() call, lena is not
 * used: every offset into a is derived from i, lenb and e.
 *
 * R0 and R1 hold 16-bit cost lanes and are updated alternately from per-step
 * mismatch flags (Diag) plus constant penalty vectors
 * (Side1/Side2/Down1/Down2).
 *
 * NOTE(review): `+` is applied directly to __m128i values. That is a compiler
 * vector extension rather than an SSE2 intrinsic; confirm the lane width it
 * uses on the target compiler (_mm_add_epi16 would make it explicit).
 */
567 int backwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb)
568 {
569 if(lenb == 0 || lena == 0)
570 return 0;
571
572 int i = 0;
573 int j = 0;
574 int k = 0;
575
576
577 int e = errThreshold;
578
579 char flag = 0;
580
/* 2*e serves as the "too expensive" value throughout this function. */
581 int minError = 2*e;
582
583 __m128i R0, R1;
584 __m128i Diag;
585 __m128i Side1, Side2;
586 __m128i Down1, Down2;
/* Error is filled in below but never read inside this function. */
587 __m128i Error;
588 __m128i tmp;
589
590 /* initialize */
591 R0 = _mm_setzero_si128 ();
592 R1 = _mm_setzero_si128 ();
593 Diag = _mm_setzero_si128 ();
594 Side1 = _mm_setzero_si128 ();
595 Side2 = _mm_setzero_si128 ();
596 Down1 = _mm_setzero_si128 ();
597 Down2 = _mm_setzero_si128 ();
598 Error = _mm_setzero_si128 ();
599 tmp = _mm_setzero_si128 ();
600 /* end initialize */
601
/* Short second sequence: use the scalar routine instead. */
602 if(lenb <= e)
603 {
604 return smallEditDistanceB(a,lena,b,lenb);
605 }
606
607
608 R1 = _mm_xor_si128(R1, R1);
609 R0 = _mm_xor_si128(R0, R0);
610
611 Diag = _mm_xor_si128(Diag, Diag);
612 Side1 = _mm_xor_si128(Side1, Side1);
613 Down1 = _mm_xor_si128(Down1, Down1);
614
615 Diag = _mm_insert_epi16(Diag,2*e,0);
616
617 Side1 = _mm_insert_epi16(Side1,1,0);
618 Side1 = _mm_insert_epi16(Side1,2*e,1);
619
620 Down1 = _mm_insert_epi16(Down1,2*e,0);
621 Down1 = _mm_insert_epi16(Down1,1,1);
622 Down1 = _mm_insert_epi16(Down1,2*e,2);
623
624 R0 = _mm_insert_epi16(R0,0,0);
625
626 R1 = _mm_insert_epi16(R1,1,0);
627 R1 = _mm_insert_epi16(R1,1,1);
628
/* Warm-up steps i = 2..e: the penalty vectors and Diag gain one lane per
 * step, and Diag is framed by 2*e in its first and last used lane. */
629 for(i=2; i <= e; i++)
630 {
631 //set side
632 Side1 = _mm_slli_si128(Side1,2);
633 Side1 = _mm_insert_epi16(Side1,1,0);
634
635 Down1 = _mm_insert_epi16(Down1,1,0);
636 Down1 = _mm_slli_si128(Down1,2);
637 Down1 = _mm_insert_epi16(Down1,2*e,0);
638
639 Diag = _mm_xor_si128(Diag, Diag);
640 if( i%2 == 0)
641 {
642 Diag = _mm_insert_epi16(Diag,2*e,0);
643
644 for(j=1;j<=i-1;j++)
645 {
646 Diag = _mm_slli_si128(Diag, 2);
647 Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+(i/2-j))) != *(a-(i/2-1-(i/2-j))),0);
648 }
649 Diag = _mm_slli_si128(Diag, 2);
650 Diag = _mm_insert_epi16(Diag, 2*e,0);
651
652 R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag);
653 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1);
654 }
655
656 else
657 {
658 Diag = _mm_insert_epi16(Diag,2*e,0);
659 for(j=i/2-1;j>=-i/2;j--)
660 {
661 Diag = _mm_slli_si128(Diag, 2);
662 Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i-1)/2-j-1)),0);
663 }
664 Diag = _mm_slli_si128(Diag, 2);
665 Diag = _mm_insert_epi16(Diag, 2*e,0);
666
667 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
668 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
669 }
670 }
/* Rebuild the constant vectors for the main loop. Side1 is NOT reset here
 * and keeps the value left by the warm-up loop. */
671 Error = _mm_xor_si128(Error, Error);
672 Side2 = _mm_xor_si128(Side2, Side2);
673 Down2 = _mm_xor_si128(Down2, Down2);
674 Down1 = _mm_xor_si128(Down1, Down1);
675
676 Error = _mm_insert_epi16(Error,e,0);
677 Side2 = _mm_insert_epi16(Side2,2*e,0);
678 Down1 = _mm_insert_epi16(Down1,2*e,0);
679
680
681 for(j=0; j < e; j++)
682 {
683 Side2 = _mm_slli_si128(Side2, 2);
684 Side2 = _mm_insert_epi16(Side2,1,0);
685
686 Down1 = _mm_slli_si128(Down1, 2);
687 Down1 = _mm_insert_epi16(Down1,1,0);
688
689 Down2 = _mm_slli_si128(Down2, 2);
690 Down2 = _mm_insert_epi16(Down2,1,0);
691
692 Error = _mm_slli_si128(Error, 2);
693 Error = _mm_insert_epi16(Error, e, 0);
694 }
695
696 Down2= _mm_slli_si128(Down2, 2);
697 Down2 = _mm_insert_epi16(Down2,2*e,0);
698
/* Main loop: continues from the i left by the warm-up loop (e + 1). */
699 for(; i <= 2*lenb-(e-1);i++)
700 {
701 flag = 0;
702 Diag = _mm_xor_si128(Diag, Diag);
703 if( i%2 == 0)
704 {
705 for(j=e/2;j>=-e/2;j--)
706 {
707 Diag = _mm_slli_si128(Diag, 2);
708 Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+j)) != *(a-(i/2-1-j)),0);
709 }
710
711 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
712 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
713
/* Early exit: give up when lane 0 and the following e+1 lanes of R0 are
 * all above e. */
714 if(_mm_extract_epi16(R0,0) <= e)
715 flag = 1;
716 tmp = _mm_srli_si128(R0,2);
717 for(j=0; j <= e;j++)
718 {
719 if(_mm_extract_epi16(tmp,0) <= e)
720 flag = 1;
721 tmp = _mm_srli_si128(tmp,2);
722 }
723
724 if(flag == 0)
725 return -1;
726
/* Takes lane e of R0 as the first candidate for the result. */
727 if(i == 2*lenb-e)
728 {
729 tmp = _mm_srli_si128(R0,2);
730 for(k=0; k < e-1;k++)
731 tmp = _mm_srli_si128(tmp,2);
732 minError = _mm_extract_epi16(tmp,0);
733 }
734
735 }
736
737 else
738 {
739 for(j=-e/2+1;j<=e/2;j++)
740 {
741 Diag = _mm_slli_si128(Diag, 2);
742 Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2-j-1)) != *(a-((i-1)/2+j-1)),0);
743 }
744
745 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
746 R1 = _mm_min_epi16(R1, R0+Down1);
747
748
/* Near the end of b: fold lane e-1 of R1 into the running minimum. */
749 if(i >= 2*lenb-e)
750 {
751 tmp = _mm_srli_si128(R1,2);
752 for(k=0; k < e-2;k++)
753 tmp = _mm_srli_si128(tmp,2);
754 minError = min(minError, _mm_extract_epi16(tmp,0));
755 }
756 }
757 }
758
/* Tail: 2*(e-2)+1 further steps. i keeps advancing (incremented at the end
 * of each pass), the number of freshly compared characters shrinks with
 * tmpE, and one lane per step is folded into minError. */
759 j=0;
760 int tmpE = e;
761 for(;j<2*(e-2)+1;j++)
762 {
763
764 Diag = _mm_xor_si128(Diag, Diag);
765 //set the first element
766 if(j==0)
767 {
768 for( k=0;k<=e-1;k++ )
769 {
770 Diag = _mm_slli_si128(Diag, 2);
771 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
772 }
773
774 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
775 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
776
777
778 tmpE--;
779 tmp = _mm_srli_si128(R0,2);
780 for(k=0; k < e-2;k++)
781 tmp = _mm_srli_si128(tmp,2);
782 minError = min(minError, _mm_extract_epi16(tmp,0));
783 }
784 else if(j%2 == 0)
785 {
786 for(k=0;k<tmpE;k++)
787 {
788 Diag = _mm_slli_si128(Diag, 2);
789 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
790 }
791
792 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
793 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
794
795 tmpE--;
796
797 tmp = _mm_srli_si128(R0,2);
798 for(k=0; k < tmpE-1;k++)
799 tmp = _mm_srli_si128(tmp,2);
800 minError = min(minError, _mm_extract_epi16(tmp,0));
801 }
802
803
804 else
805 {
806 for(k=0;k<tmpE;k++)
807 {
808 Diag = _mm_slli_si128(Diag, 2);
809 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
810 }
811
812 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
813 R1 = _mm_min_epi16(R1, R0+Down1);
814
815 tmp = _mm_srli_si128(R1,2);
816 for(k=0; k < tmpE-2;k++)
817 tmp = _mm_srli_si128(tmp,2);
818 minError = min(minError, _mm_extract_epi16(tmp,0));
819 }
820 i++;
821 }
/* Last two steps: only lanes 0 and 1 of Diag, Side1 and Down1 are rewritten
 * here; R1 lane 1 and then R0 lane 0 are folded into minError. */
822 //Diag
823
824 Diag = _mm_xor_si128(Diag,Diag);
825 Diag = _mm_insert_epi16(Diag, 2*e, 0);
826 Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1);
827
828 Side1 = _mm_insert_epi16(Side1,1,0);
829 Side1 = _mm_insert_epi16(Side1,1,1);
830
831 Down1 = _mm_insert_epi16(Down1, 2*e, 0);
832 Down1 = _mm_insert_epi16(Down1, 1, 1);
833
834 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
835 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
836
837 minError = min(minError, _mm_extract_epi16(R1,1));
838
839 Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0);
840 Down1 = _mm_insert_epi16(Down1, 1, 0);
841
842 R0 = _mm_min_epi16(R1+Down1,R0+Diag);
843 R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1);
844
845 minError = min(minError, _mm_extract_epi16(R0,0));
846
847 if(minError > e)
848 return -1;
849 return minError;
850 }
851
/*
 * SSE2 edit-distance check with the band parameter fixed at e = 4, while the
 * accept/reject threshold is the global errThreshold (kept in `mismatch`).
 * Both sequences are read backward from the supplied pointers (characters
 * are fetched as *(a - k) and *(b - k)).
 *
 * Returns:
 *   0            when lena or lenb is 0;
 *   the result of smallEditDistanceB() when lenb <= 4;
 *   -1           when, on an even step of the main loop, lanes 0..4 of R0
 *                and lanes 0..3 of R1 are all above errThreshold, or when
 *                the final minimum is above errThreshold;
 *   otherwise the smallest error value collected in minError.
 *
 * Apart from the zero check and the smallEditDistanceB() call, lena is not
 * used: every offset into a is derived from i, index, lenb and e.
 *
 * Reads the global MASK, which initLookUpTable() sets to 1 in 16-bit lanes
 * 0..4 and 0 in lanes 5..7; with any other MASK the mismatch vector built in
 * the main loop is different.
 *
 * NOTE(review): `+` is applied directly to __m128i values. That is a compiler
 * vector extension rather than an SSE2 intrinsic; confirm the lane width it
 * uses on the target compiler (_mm_add_epi16 would make it explicit).
 * NOTE(review): plain `inline` on a function with external linkage has
 * different linkage rules in C99/C11 than in GNU89 mode -- confirm the
 * build flags or a matching non-inline declaration in the header.
 */
852 inline int backwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb)
853 {
854 if(lenb == 0 || lena == 0)
855 return 0;
856
857 int i = 0;
858 int j = 0;
859 int k = 0;
860
861 int i0;
862 int i1;
863 int i2;
864 int i4;
865 int i5;
866
867 int e = 4;
868 int mismatch = errThreshold;
869
870 int minError = 2*errThreshold;
871 int index = 0;
872 int tmpValue = 0;
873
/* Short second sequence: use the scalar routine instead. */
874 if(lenb <= e)
875 {
876 return smallEditDistanceB(a,lena,b,lenb);
877 }
878
879
880 __m128i R0, R1;
881 __m128i Diag;
882 __m128i Side1, Side2;
883 __m128i Down1, Down2;
884 __m128i tmp;
885 __m128i SeqA, SeqB;
886 __m128i Result;
887
888 /* initialize */
889 R0 = _mm_setzero_si128 ();
890 R1 = _mm_setzero_si128 ();
891 Diag = _mm_setzero_si128 ();
892 Side1 = _mm_setzero_si128 ();
893 Side2 = _mm_setzero_si128 ();
894 Down1 = _mm_setzero_si128 ();
895 Down2 = _mm_setzero_si128 ();
896 SeqA = _mm_setzero_si128 ();
897 SeqB = _mm_setzero_si128 ();
898 Result = _mm_setzero_si128 ();
899 /* end initialize */
900
901 R1 = _mm_xor_si128(R1, R1);
902 R0 = _mm_xor_si128(R0, R0);
903
904 Diag = _mm_xor_si128(Diag, Diag);
905 Diag = _mm_insert_epi16(Diag,minError,0);
906
/* Scalar seed values computed from the characters at offsets 0, -1 and -2
 * of a and b; they are loaded into R1 and R0 below. */
907 i0 = (a[0] != b[0]);
908 i1 = min(i0, ( *(a-1)!=*b) )+1;
909 i2 = min(i0,( a[0] != *(b-1) ) )+1;
910
911 i0 = min3( i0+ ( *(a-1)!=*(b-1) ),i1+1,i2+1);
912 i4 = min(i1, ( *(a-2)!=b[0] )+1)+1;
913 i5 = min(i2, (a[0] != *(b-2))+1)+1;
914
915 R1 = _mm_insert_epi16(R1, 3, 0);
916 R1 = _mm_insert_epi16(R1, i1, 1);
917 R1 = _mm_insert_epi16(R1, i2, 2);
918 R1 = _mm_insert_epi16(R1, 3, 3);
919
920
921 R0 = _mm_insert_epi16(R0, 4, 0);
922 R0 = _mm_insert_epi16(R0, i4, 1);
923 R0 = _mm_insert_epi16(R0, i0, 2);
924 R0 = _mm_insert_epi16(R0, i5, 3);
925 R0 = _mm_insert_epi16(R0, 4, 4);
926
927
928 Side2 = _mm_xor_si128(Side2, Side2);
929 Down2 = _mm_xor_si128(Down2, Down2);
930 Down1 = _mm_xor_si128(Down1, Down1);
931 Side1 = _mm_xor_si128(Side1, Side1);
932
933 Side2 = _mm_insert_epi16(Side2,minError,0);
934 Down1 = _mm_insert_epi16(Down1,minError,0);
935
936 Side1 = _mm_insert_epi16(Side1,1,0);
937
/* Build the constant penalty vectors and preload the first e (4) characters
 * of each sequence into SeqA/SeqB, one 16-bit lane per character, newest
 * character in lane 0. */
938 index = 0;
939 for(j=0; j < e; j++)
940 {
941 Side2 = _mm_slli_si128(Side2, 2);
942 Side2 = _mm_insert_epi16(Side2,1,0);
943
944 Down1 = _mm_slli_si128(Down1, 2);
945 Down1 = _mm_insert_epi16(Down1,1,0);
946
947 Down2 = _mm_slli_si128(Down2, 2);
948 Down2 = _mm_insert_epi16(Down2,1,0);
949
950 Side1 = _mm_slli_si128(Side1, 2);
951 Side1 = _mm_insert_epi16(Side1,1,0);
952
953 SeqA = _mm_slli_si128(SeqA, 2);
954 SeqB = _mm_slli_si128(SeqB, 2);
955 SeqA = _mm_insert_epi16(SeqA,*(a-index),0);
956 SeqB = _mm_insert_epi16(SeqB,*(b-index),0);
957 index++;
958 }
959
960 Down2= _mm_slli_si128(Down2, 2);
961 Down2 = _mm_insert_epi16(Down2,minError,0);
962
963 index = 4;
964 i = 5;
965
/* Main loop: R0 is updated on even i, R1 on odd i. */
966 int loopEnd = 2*lenb-(e-1);
967 for(; i <= loopEnd ;i++)
968 {
969
970 Diag = _mm_xor_si128(Diag, Diag);
971 if( i%2 == 0)
972 {
/* Even step: push the next character of each sequence into lane 0. */
973 SeqA = _mm_slli_si128(SeqA, 2);
974 SeqB = _mm_slli_si128(SeqB, 2);
975 SeqA = _mm_insert_epi16(SeqA,*(a-(index)),0);
976 SeqB = _mm_insert_epi16(SeqB,*(b-(index)),0);
977
978 index++;
979
/* Shuffle control 27 (binary 00 01 10 11) reverses the four low lanes of
 * SeqB. The result is then shifted up by one lane and the value that lands
 * in lane 5 is copied into lane 0 before comparing against SeqA. */
980 tmp = _mm_shufflelo_epi16(SeqB,27);
981 tmp = _mm_slli_si128(tmp, 2);
982 tmpValue = _mm_extract_epi16(tmp, 5);
983 tmp = _mm_insert_epi16(tmp, tmpValue, 0);
984
/* Diag = (~equal) & MASK: lanes selected by MASK hold 1 where the compared
 * characters differ and 0 where they match. */
985 Result = _mm_cmpeq_epi16(SeqA, tmp);
986 Diag = _mm_andnot_si128(Result, MASK);
987
988 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
989 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
990
/* Early exit: every inspected lane of R0 and R1 is above errThreshold. */
991 if(_mm_extract_epi16(R0, 0) > errThreshold && _mm_extract_epi16(R0, 1) > errThreshold && _mm_extract_epi16(R0, 2) > errThreshold
992 && _mm_extract_epi16(R0, 3) > errThreshold && _mm_extract_epi16(R0, 4) > errThreshold && _mm_extract_epi16(R1, 0) > errThreshold
993 && _mm_extract_epi16(R1, 1) > errThreshold && _mm_extract_epi16(R1, 2) > errThreshold && _mm_extract_epi16(R1, 3) > errThreshold)
994 return -1;
995
/* Takes lane e of R0 as the first candidate for the result. */
996 if(i == 2*lenb-e)
997 {
998 tmp = _mm_srli_si128(R0,2);
999 for(k=0; k < e-1;k++)
1000 tmp = _mm_srli_si128(tmp,2);
1001 minError = _mm_extract_epi16(tmp,0);
1002 }
1003
1004 }
1005
1006 else
1007 {
/* Odd step: compare SeqA against SeqB with its four low lanes reversed. */
1008 Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27));
1009 Diag = _mm_andnot_si128(Result, MASK);
1010
1011 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
1012 R1 = _mm_min_epi16(R1, R0+Down1);
1013
1014
/* Near the end of b: fold lane e-1 of R1 into the running minimum. */
1015 if(i >= 2*lenb-e)
1016 {
1017 tmp = _mm_srli_si128(R1,2);
1018 for(k=0; k < e-2;k++)
1019 tmp = _mm_srli_si128(tmp,2);
1020 minError = min(minError, _mm_extract_epi16(tmp,0));
1021 }
1022 }
1023
1024
1025 }
1026
/* Tail: 2*(e-2)+1 further steps using scalar character comparisons. i keeps
 * advancing (incremented at the end of each pass), the number of freshly
 * compared characters shrinks with tmpE, and one lane per step is folded
 * into minError. */
1027 j=0;
1028 int tmpE = e;
1029 for(;j<2*(e-2)+1;j++)
1030 {
1031
1032 Diag = _mm_xor_si128(Diag, Diag);
1033 //set the first element
1034 if(j==0)
1035 {
1036 for( k=0;k<=e-1;k++ )
1037 {
1038 Diag = _mm_slli_si128(Diag, 2);
1039 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
1040 }
1041
1042 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1043 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1044
1045 tmpE--;
1046
1047 tmp = _mm_srli_si128(R0,2);
1048 for(k=0; k < e-2;k++)
1049 tmp = _mm_srli_si128(tmp,2);
1050 minError = min(minError, _mm_extract_epi16(tmp,0));
1051 }
1052 else if(j%2 == 0)
1053 {
1054 for(k=0;k<tmpE;k++)
1055 {
1056 Diag = _mm_slli_si128(Diag, 2);
1057 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
1058 }
1059
1060 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1061 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1062
1063 tmpE--;
1064
1065 tmp = _mm_srli_si128(R0,2);
1066 for(k=0; k < tmpE-1;k++)
1067 tmp = _mm_srli_si128(tmp,2);
1068 minError = min(minError, _mm_extract_epi16(tmp,0));
1069 }
1070
1071
1072 else
1073 {
1074 for(k=0;k<tmpE;k++)
1075 {
1076 Diag = _mm_slli_si128(Diag, 2);
1077 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
1078 }
1079
1080 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
1081 R1 = _mm_min_epi16(R1, R0+Down1);
1082
1083 tmp = _mm_srli_si128(R1,2);
1084 for(k=0; k < tmpE-2;k++)
1085 tmp = _mm_srli_si128(tmp,2);
1086 minError = min(minError, _mm_extract_epi16(tmp,0));
1087 }
1088 i++;
1089 }
/* Last two steps: only lanes 0 and 1 of Diag, Side1 and Down1 are rewritten
 * here; R1 lane 1 and then R0 lane 0 are folded into minError. */
1090 //Diag
1091
1092 Diag = _mm_xor_si128(Diag,Diag);
1093 Diag = _mm_insert_epi16(Diag, 2*errThreshold, 0);
1094 Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1);
1095
1096 Side1 = _mm_insert_epi16(Side1,1,0);
1097 Side1 = _mm_insert_epi16(Side1,1,1);
1098
1099 Down1 = _mm_insert_epi16(Down1, 2*errThreshold, 0);
1100 Down1 = _mm_insert_epi16(Down1, 1, 1);
1101
1102 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
1103 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
1104
1105 minError = min(minError, _mm_extract_epi16(R1,1));
1106
1107 Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0);
1108 Down1 = _mm_insert_epi16(Down1, 1, 0);
1109
1110 R0 = _mm_min_epi16(R1+Down1,R0+Diag);
1111 R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1);
1112
1113 minError = min(minError, _mm_extract_epi16(R0,0));
1114
1115 if(minError > mismatch)
1116 return -1;
1117 return minError;
1118 }
1119
1120 int backwardEditDistance4SSE2(char *a, int lena, char *b,int lenb)
1121 {
1122 if(lenb == 0 || lena == 0)
1123 return 0;
1124
1125 int i = 0;
1126 int j = 0;
1127 int k = 0;
1128
1129 int i0;
1130 int i1;
1131 int i2;
1132 int i4;
1133 int i5;
1134
1135 int e = errThreshold;
1136
1137 int minError = 2*e;
1138 int index = 0;
1139 int tmpValue = 0;
1140
1141 if(lenb <= e)
1142 {
1143 return smallEditDistanceB(a,lena,b,lenb);
1144 }
1145
1146 __m128i R0, R1;
1147 __m128i Diag;
1148 __m128i Side1, Side2;
1149 __m128i Down1, Down2;
1150 __m128i tmp;
1151 __m128i SeqA, SeqB;
1152 __m128i Result;
1153
1154 /* initialize */
1155 R0 = _mm_setzero_si128 ();
1156 R1 = _mm_setzero_si128 ();
1157 Diag = _mm_setzero_si128 ();
1158 Side1 = _mm_setzero_si128 ();
1159 Side2 = _mm_setzero_si128 ();
1160 Down1 = _mm_setzero_si128 ();
1161 Down2 = _mm_setzero_si128 ();
1162 SeqA = _mm_setzero_si128 ();
1163 SeqB = _mm_setzero_si128 ();
1164 Result = _mm_setzero_si128 ();
1165 /* end initialize */
1166
1167 R1 = _mm_xor_si128(R1, R1);
1168 R0 = _mm_xor_si128(R0, R0);
1169
1170 Diag = _mm_xor_si128(Diag, Diag);
1171 Diag = _mm_insert_epi16(Diag,2*e,0);
1172
1173 i0 = (a[0] != b[0]);
1174 i1 = min(i0, ( *(a-1)!=*b) )+1;
1175 i2 = min(i0,( a[0] != *(b-1) ) )+1;
1176
1177 i0 = min3( i0+ ( *(a-1)!=*(b-1) ),i1+1,i2+1);
1178 i4 = min(i1, ( *(a-2)!=b[0] )+1)+1;
1179 i5 = min(i2, (a[0] != *(b-2))+1)+1;
1180
1181 R1 = _mm_insert_epi16(R1, 3, 0);
1182 R1 = _mm_insert_epi16(R1, i1, 1);
1183 R1 = _mm_insert_epi16(R1, i2, 2);
1184 R1 = _mm_insert_epi16(R1, 3, 3);
1185
1186
1187 R0 = _mm_insert_epi16(R0, 4, 0);
1188 R0 = _mm_insert_epi16(R0, i4, 1);
1189 R0 = _mm_insert_epi16(R0, i0, 2);
1190 R0 = _mm_insert_epi16(R0, i5, 3);
1191 R0 = _mm_insert_epi16(R0, 4, 4);
1192
1193 Side2 = _mm_xor_si128(Side2, Side2);
1194 Down2 = _mm_xor_si128(Down2, Down2);
1195 Down1 = _mm_xor_si128(Down1, Down1);
1196 Side1 = _mm_xor_si128(Side1, Side1);
1197
1198 Side2 = _mm_insert_epi16(Side2,2*e,0);
1199 Down1 = _mm_insert_epi16(Down1,2*e,0);
1200
1201 Side1 = _mm_insert_epi16(Side1,1,0);
1202
1203 index = 0;
1204 for(j=0; j < e; j++)
1205 {
1206 Side2 = _mm_slli_si128(Side2, 2);
1207 Side2 = _mm_insert_epi16(Side2,1,0);
1208
1209 Down1 = _mm_slli_si128(Down1, 2);
1210 Down1 = _mm_insert_epi16(Down1,1,0);
1211
1212 Down2 = _mm_slli_si128(Down2, 2);
1213 Down2 = _mm_insert_epi16(Down2,1,0);
1214
1215 Side1 = _mm_slli_si128(Side1, 2);
1216 Side1 = _mm_insert_epi16(Side1,1,0);
1217
1218 SeqA = _mm_slli_si128(SeqA, 2);
1219 SeqB = _mm_slli_si128(SeqB, 2);
1220 SeqA = _mm_insert_epi16(SeqA,*(a-index),0);
1221 SeqB = _mm_insert_epi16(SeqB,*(b-index),0);
1222 index++;
1223 }
1224
1225 Down2= _mm_slli_si128(Down2, 2);
1226 Down2 = _mm_insert_epi16(Down2,2*e,0);
1227
1228 index = 4;
1229 i = 5;
1230 int loopEnd = 2*lenb-(e-1);
1231 for(; i <= loopEnd ;i++)
1232 {
1233
1234 Diag = _mm_xor_si128(Diag, Diag);
1235 if( i%2 == 0)
1236 {
1237 SeqA = _mm_slli_si128(SeqA, 2);
1238 SeqB = _mm_slli_si128(SeqB, 2);
1239 SeqA = _mm_insert_epi16(SeqA,*(a-(index)),0);
1240 SeqB = _mm_insert_epi16(SeqB,*(b-(index)),0);
1241
1242 index++;
1243
1244 tmp = _mm_shufflelo_epi16(SeqB,27);
1245 tmp = _mm_slli_si128(tmp, 2);
1246 tmpValue = _mm_extract_epi16(tmp, 5);
1247 tmp = _mm_insert_epi16(tmp, tmpValue, 0);
1248
1249 Result = _mm_cmpeq_epi16(SeqA, tmp);
1250 Diag = _mm_andnot_si128(Result, MASK);
1251
1252 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1253 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1254
1255 //tmp = _mm_sub_epi16(Error, R0);
1256 //i0 = _mm_movemask_epi8(tmp);
1257
1258 if( _mm_extract_epi16(R0, 0) > e && _mm_extract_epi16(R0, 1) > e && _mm_extract_epi16(R0, 2) > e
1259 && _mm_extract_epi16(R0, 3) > e && _mm_extract_epi16(R0, 4) > e && _mm_extract_epi16(R1, 0) > e &&
1260 _mm_extract_epi16(R1, 1) > e && _mm_extract_epi16(R1, 2) > e && _mm_extract_epi16(R1, 3) > e )
1261 return -1;
1262
1263 if(i == 2*lenb-e)
1264 {
1265 tmp = _mm_srli_si128(R0,2);
1266 for(k=0; k < e-1;k++)
1267 tmp = _mm_srli_si128(tmp,2);
1268 minError = _mm_extract_epi16(tmp,0);
1269 }
1270
1271 }
1272
1273 else
1274 {
1275 Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27));
1276 Diag = _mm_andnot_si128(Result, MASK);
1277
1278 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
1279 R1 = _mm_min_epi16(R1, R0+Down1);
1280
1281 if(i >= 2*lenb-e)
1282 {
1283 tmp = _mm_srli_si128(R1,2);
1284 for(k=0; k < e-2;k++)
1285 tmp = _mm_srli_si128(tmp,2);
1286 minError = min(minError, _mm_extract_epi16(tmp,0));
1287 }
1288 }
1289
1290
1291 }
1292
1293 j=0;
1294
1295 int tmpE = e;
1296
1297 for(;j<2*(e-2)+1;j++)
1298 {
1299
1300 Diag = _mm_xor_si128(Diag, Diag);
1301 //set the first element
1302 if(j==0)
1303 {
1304 for( k=0;k<=e-1;k++ )
1305 {
1306 Diag = _mm_slli_si128(Diag, 2);
1307 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
1308 }
1309
1310 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1311 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1312
1313 tmpE--;
1314
1315 tmp = _mm_srli_si128(R0,2);
1316 for(k=0; k < e-2;k++)
1317 tmp = _mm_srli_si128(tmp,2);
1318 minError = min(minError, _mm_extract_epi16(tmp,0));
1319 }
1320 else if(j%2 == 0)
1321 {
1322 for(k=0;k<tmpE;k++)
1323 {
1324 Diag = _mm_slli_si128(Diag, 2);
1325 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
1326 }
1327
1328 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1329 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1330
1331 tmpE--;
1332
1333 tmp = _mm_srli_si128(R0,2);
1334 for(k=0; k < tmpE-1;k++)
1335 tmp = _mm_srli_si128(tmp,2);
1336 minError = min(minError, _mm_extract_epi16(tmp,0));
1337 }
1338
1339
1340 else
1341 {
1342 for(k=0;k<tmpE;k++)
1343 {
1344 Diag = _mm_slli_si128(Diag, 2);
1345 Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0);
1346 }
1347
1348 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
1349 R1 = _mm_min_epi16(R1, R0+Down1);
1350
1351 tmp = _mm_srli_si128(R1,2);
1352 for(k=0; k < tmpE-2;k++)
1353 tmp = _mm_srli_si128(tmp,2);
1354 minError = min(minError, _mm_extract_epi16(tmp,0));
1355 }
1356 i++;
1357 }
1358 //Diag
1359
1360 Diag = _mm_xor_si128(Diag,Diag);
1361 Diag = _mm_insert_epi16(Diag, 2*e, 0);
1362 Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1);
1363
1364 Side1 = _mm_insert_epi16(Side1,1,0);
1365 Side1 = _mm_insert_epi16(Side1,1,1);
1366
1367 Down1 = _mm_insert_epi16(Down1, 2*e, 0);
1368 Down1 = _mm_insert_epi16(Down1, 1, 1);
1369
1370 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
1371 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
1372
1373 minError = min(minError, _mm_extract_epi16(R1,1));
1374
1375 Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0);
1376 Down1 = _mm_insert_epi16(Down1, 1, 0);
1377
1378 R0 = _mm_min_epi16(R1+Down1,R0+Diag);
1379 R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1);
1380
1381 minError = min(minError, _mm_extract_epi16(R0,0));
1382
1383 if(minError > e)
1384 return -1;
1385 return minError;
1386 }
1387
/*
 * forwardEditDistanceSSE2Extention
 *
 * Banded edit-distance check of b against a, computed with SSE2 on 16-bit
 * lanes. R0 and R1 hold alternating anti-diagonals of the DP matrix (within
 * one step the compared a/b index pairs have a constant sum). The lane
 * layout is fixed by the local constant e = 4: R0 uses lanes 0..4 and R1
 * uses lanes 0..3.
 *
 * Parameters:
 *   a, lena - first sequence and its length. lena is only tested against 0
 *             and forwarded to smallEditDistanceF(); the main path indexes a
 *             up to a[lenb+e-1], so that many characters must be readable.
 *   b, lenb - second sequence and its length; lenb drives every loop bound.
 *
 * Returns:
 *   0 when either length is 0; the result of smallEditDistanceF() when
 *   lenb <= e; otherwise the smallest value collected during the final
 *   steps, or -1 when that value exceeds errThreshold or when every tracked
 *   lane exceeds errThreshold part-way through the main loop.
 *
 * NOTE(review): expressions such as R1+Side2 apply `+` directly to __m128i,
 * which relies on the compiler's vector extension; presumably no lane ever
 * carries out of 16 bits, so this behaves like _mm_add_epi16 - confirm.
 */
1388 inline int forwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb)
1389 {
1390 if(lenb == 0 || lena == 0)
1391 return 0;
1392
1393 int i = 0;
1394 int j = 0;
1395 int k = 0;
1396
// Scalar seeds for the first DP cells; filled from a[0..2] and b[0..2] below.
1397 int i0=0;
1398 int i1=0;
1399 int i2=0;
1400 int i4=0;
1401 int i5=0;
1402
// mismatch is the acceptance threshold; e is the fixed band constant that sizes the lane layout.
1403 int mismatch = errThreshold;
1404 int e = 4;
1405
// Starts above the threshold and doubles as the "blocked" cost written into lane 0 of Diag/Side2/Down1/Down2.
1406 int minError = 4*mismatch+1;
1407 int index = 0;
1408 int tmpValue = 0;
1409
// Short b: hand over to the scalar helper (defined outside this block).
1410 if(lenb <= e)
1411 {
1412 return smallEditDistanceF(a,lena,b,lenb);
1413 }
1414
1415
1416 register __m128i R0, R1;
1417 __m128i Diag;
1418 __m128i Side1, Side2;
1419 __m128i Down1, Down2;
1420 __m128i tmp;
1421 register __m128i SeqA, SeqB;
1422 __m128i Result;
1423
1424 __m128i tmpSeqA;
1425 __m128i tmpSeqB;
1426
1427 /* initialize */
1428 R0 = _mm_setzero_si128 ();
1429 R1 = _mm_setzero_si128 ();
1430 Diag = _mm_setzero_si128 ();
1431 Side1 = _mm_setzero_si128 ();
1432 Side2 = _mm_setzero_si128 ();
1433 Down1 = _mm_setzero_si128 ();
1434 Down2 = _mm_setzero_si128 ();
1435 SeqA = _mm_setzero_si128 ();
1436 SeqB = _mm_setzero_si128 ();
1437 Result = _mm_setzero_si128 ();
1438 /* end initialize */
1439
1440
// The xor-with-self calls below zero vectors that are already zero; they are redundant but harmless.
1441 R1 = _mm_xor_si128(R1, R1);
1442 R0 = _mm_xor_si128(R0, R0);
1443
1444 Diag = _mm_xor_si128(Diag, Diag);
1445 Diag = _mm_insert_epi16(Diag,minError,0);
1446
// Compute the first few DP cells in scalar code from the leading characters of a and b.
1447 i0 = (a[0] != b[0]);
1448 i1 = min(i0, (a[1]!=b[0]))+1;
1449 i2 = min(i0,(a[0]!=b[1]))+1;
1450
1451 i0 = min3(i0+(a[1]!=b[1]),i1+1,i2+1);
1452 i4 = min(i1, (a[2]!=b[0])+1)+1;
1453 i5 = min(i2, (a[0]!=b[2])+1)+1;
1454
// Seed R1 (lanes 0..3) and R0 (lanes 0..4) with those cells; the outer lanes get the boundary costs 3 and 4.
1455 R1 = _mm_insert_epi16(R1, 3, 0);
1456 R1 = _mm_insert_epi16(R1, i1, 1);
1457 R1 = _mm_insert_epi16(R1, i2, 2);
1458 R1 = _mm_insert_epi16(R1, 3, 3);
1459
1460 R0 = _mm_insert_epi16(R0, 4, 0);
1461 R0 = _mm_insert_epi16(R0, i4, 1);
1462 R0 = _mm_insert_epi16(R0, i0, 2);
1463 R0 = _mm_insert_epi16(R0, i5, 3);
1464 R0 = _mm_insert_epi16(R0, 4, 4);
1465
1466 Side2 = _mm_xor_si128(Side2, Side2);
1467 Down2 = _mm_xor_si128(Down2, Down2);
1468 Down1 = _mm_xor_si128(Down1, Down1);
1469 Side1 = _mm_xor_si128(Side1, Side1);
1470
1471 Side2 = _mm_insert_epi16(Side2,minError,0);
1472 Down1 = _mm_insert_epi16(Down1,minError,0);
1473
1474 Side1 = _mm_insert_epi16(Side1,1,0);
1475
// Build the per-lane step costs (1 per lane, with the sentinel pushed to the top lane of Side2/Down1)
// and preload the first e characters of a and b, newest character in lane 0.
1476 index = 0;
1477 for(j=0; j < e; j++)
1478 {
1479 Side2 = _mm_slli_si128(Side2, 2);
1480 Side2 = _mm_insert_epi16(Side2,1,0);
1481
1482 Down1 = _mm_slli_si128(Down1, 2);
1483 Down1 = _mm_insert_epi16(Down1,1,0);
1484
1485 Down2 = _mm_slli_si128(Down2, 2);
1486 Down2 = _mm_insert_epi16(Down2,1,0);
1487
1488 Side1 = _mm_slli_si128(Side1, 2);
1489 Side1 = _mm_insert_epi16(Side1,1,0);
1490
1491 SeqA = _mm_slli_si128(SeqA, 2);
1492 SeqB = _mm_slli_si128(SeqB, 2);
1493 SeqA = _mm_insert_epi16(SeqA,a[index],0);
1494 SeqB = _mm_insert_epi16(SeqB,b[index],0);
1495 index++;
1496 }
1497
// Down2 gets the sentinel in lane 0 instead of the top lane.
1498 Down2= _mm_slli_si128(Down2, 2);
1499 Down2 = _mm_insert_epi16(Down2,minError,0);
1500
1501 index = 4;
1502 i = 5;
1503
// Main loop: even i updates R0, odd i updates R1.
1504 int loopEnd = 2*lenb-(e-1);
1505 for(; i <= loopEnd ;i++)
1506 {
1507 if( i%2 == 0)
1508 {
// Shift the next character of each sequence into lane 0.
1509 tmpSeqA = _mm_slli_si128(SeqA, 2);
1510 tmpSeqB = _mm_slli_si128(SeqB, 2);
1511 SeqA = _mm_insert_epi16(tmpSeqA,a[index],0);
1512 SeqB = _mm_insert_epi16(tmpSeqB,b[index],0);
1513
1514 index++;
1515
// Reverse SeqB lanes 0..4 into tmp: shuffle 27 reverses lanes 0..3, the byte shift moves them up one lane,
// and the old lane 4 (now in lane 5) is copied into lane 0.
1516 tmp = _mm_shufflelo_epi16(SeqB,27);
1517 tmp = _mm_slli_si128(tmp, 2);
1518 tmpValue = _mm_extract_epi16(tmp, 5);
1519 tmp = _mm_insert_epi16(tmp, tmpValue, 0);
1520
// Diag is 0 in lanes where the characters match and MASK's lane value elsewhere
// (MASK is defined outside this block; presumably 1 per lane - confirm).
1521 Result = _mm_cmpeq_epi16(SeqA, tmp);
1522 Diag = _mm_andnot_si128(Result, MASK);
1523
// New R0 = lane-wise minimum of the three predecessor candidates.
1524 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1525 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1526
// Early exit: give up once all of R0 lanes 0..4 and R1 lanes 0..3 exceed the threshold.
1527 if(_mm_extract_epi16(R0, 0) > errThreshold && _mm_extract_epi16(R0, 1) > errThreshold && _mm_extract_epi16(R0, 2) > errThreshold
1528 && _mm_extract_epi16(R0, 3) > errThreshold && _mm_extract_epi16(R0, 4) > errThreshold &&
1529 _mm_extract_epi16(R1, 0) > errThreshold && _mm_extract_epi16(R1, 1) > errThreshold &&
1530 _mm_extract_epi16(R1, 2) > errThreshold && _mm_extract_epi16(R1, 3) > errThreshold)
1531 return -1;
1532
// First candidate result: overwrite the sentinel in minError with lane e of R0.
1533 if(i == 2*lenb-e)
1534 {
1535 tmp = _mm_srli_si128(R0,2);
1536 for(k=0; k < e-1;k++)
1537 tmp = _mm_srli_si128(tmp,2);
1538 minError = _mm_extract_epi16(tmp,0);
1539 }
1540
1541 }
1542
1543 else
1544 {
// Odd step: compare SeqA lanes 0..3 against SeqB lanes 3..0 and update R1.
1545 Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27));
1546 Diag = _mm_andnot_si128(Result, MASK);
1547
1548 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
1549 R1 = _mm_min_epi16(R1, R0+Down1);
1550
// Near the end, fold lane e-1 of R1 into the running minimum.
1551 if(i >= 2*lenb-e)
1552 {
1553 tmp = _mm_srli_si128(R1,2);
1554 for(k=0; k < e-2;k++)
1555 tmp = _mm_srli_si128(tmp,2);
1556 minError = min(minError, _mm_extract_epi16(tmp,0));
1557 }
1558 }
1559 }
1560
// Tail: 2*(e-2)+1 further steps. Diag is rebuilt from scalar comparisons each step,
// and tmpE shrinks the number of compared lanes after every R0 update.
1561 j=0;
1562 int tmpE = e;
1563 for(;j<2*(e-2)+1;j++)
1564 {
1565
1566 Diag = _mm_xor_si128(Diag, Diag);
1567 //set the first element
1568 if(j==0)
1569 {
1570 for( k=0;k<=e-1;k++ )
1571 {
1572 Diag = _mm_slli_si128(Diag, 2);
1573 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
1574 }
1575
1576 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1577 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1578
1579 tmpE--;
1580
// Bring lane e-1 of R0 down to lane 0 and fold it into minError.
1581 tmp = _mm_srli_si128(R0,2);
1582 for(k=0; k < e-2;k++)
1583 tmp = _mm_srli_si128(tmp,2);
1584 minError = min(minError, _mm_extract_epi16(tmp,0));
1585 }
1586 else if(j%2 == 0)
1587 {
1588 for(k=0;k<tmpE;k++)
1589 {
1590 Diag = _mm_slli_si128(Diag, 2);
1591 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
1592 }
1593
1594 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1595 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1596
1597 tmpE--;
1598
1599 tmp = _mm_srli_si128(R0,2);
1600 for(k=0; k < tmpE-1;k++)
1601 tmp = _mm_srli_si128(tmp,2);
1602 minError = min(minError, _mm_extract_epi16(tmp,0));
1603 }
1604
1605
1606 else
1607 {
1608 for(k=0;k<tmpE;k++)
1609 {
1610 Diag = _mm_slli_si128(Diag, 2);
1611 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
1612 }
1613
1614 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
1615 R1 = _mm_min_epi16(R1, R0+Down1);
1616
1617 tmp = _mm_srli_si128(R1,2);
1618 for(k=0; k < tmpE-2;k++)
1619 tmp = _mm_srli_si128(tmp,2);
1620 minError = min(minError, _mm_extract_epi16(tmp,0));
1621 }
1622 i++;
1623 }
1624 //Diag
1625
// Last two steps, written out by hand.
// NOTE(review): minError has already been overwritten by the running minimum at this point, so the value
// inserted into lane 0 of Diag and Down1 is no longer the initial sentinel. forwardEditDistance4SSE2 uses
// the constant 2*e in the same place - confirm this is intended.
1626 Diag = _mm_xor_si128(Diag,Diag);
1627 Diag = _mm_insert_epi16(Diag, minError, 0);
1628 Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1);
1629
1630 Side1 = _mm_insert_epi16(Side1,1,0);
1631 Side1 = _mm_insert_epi16(Side1,1,1);
1632
1633 Down1 = _mm_insert_epi16(Down1, minError, 0);
1634 Down1 = _mm_insert_epi16(Down1, 1, 1);
1635
1636 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
1637 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
1638
1639 minError = min(minError, _mm_extract_epi16(R1,1));
1640
// Highest index of a read by this function: a[lenb+e-1].
1641 Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 0);
1642 Down1 = _mm_insert_epi16(Down1, 1, 0);
1643
1644 R0 = _mm_min_epi16(R1+Down1,R0+Diag);
1645 R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1);
1646
1647
1648 minError = min(minError, _mm_extract_epi16(R0,0));
1649
1650
// Report -1 when even the best collected value is above the threshold.
1651 if(minError > mismatch)
1652 return -1;
1653 return minError;
1654 }
1655
1656
1657
/*
 * forwardEditDistance4SSE2
 *
 * Banded edit-distance check of b against a, computed with SSE2 on 16-bit
 * lanes. R0 and R1 hold alternating anti-diagonals of the DP matrix (within
 * one step the compared a/b index pairs have a constant sum). 2*e is used as
 * the "blocked" cost on the band edges.
 *
 * NOTE(review): e is read from errThreshold, but the seeding of R0/R1, the
 * early-exit test and the SeqB lane reversal are written for exactly five R0
 * lanes and four R1 lanes, and index starts at the literal 4. Presumably
 * callers only use this variant when errThreshold is 4 - confirm.
 *
 * Parameters:
 *   a, lena - first sequence and its length. lena is only tested against 0
 *             and forwarded to smallEditDistanceF(); the main path indexes a
 *             up to a[lenb+e-1], so that many characters must be readable.
 *   b, lenb - second sequence and its length; lenb drives every loop bound.
 *
 * Returns:
 *   0 when either length is 0; the result of smallEditDistanceF() when
 *   lenb <= e; otherwise the smallest value collected during the final
 *   steps, or -1 when that value exceeds e or when every tracked lane
 *   exceeds e part-way through the main loop.
 *
 * NOTE(review): expressions such as R1+Side2 apply `+` directly to __m128i,
 * which relies on the compiler's vector extension; presumably no lane ever
 * carries out of 16 bits, so this behaves like _mm_add_epi16 - confirm.
 */
1658 int forwardEditDistance4SSE2(char *a, int lena, char *b,int lenb)
1659 {
1660 if(lenb == 0 || lena == 0)
1661 return 0;
1662
1663 int i = 0;
1664 int j = 0;
1665 int k = 0;
1666
// Scalar seeds for the first DP cells; filled from a[0..2] and b[0..2] below.
1667 int i0=0;
1668 int i1=0;
1669 int i2=0;
1670 int i4=0;
1671 int i5=0;
1672
1673 int e = errThreshold;
1674
// Starts at 2*e, the same value used as the "blocked" cost in the cost vectors.
1675 int minError = 2*e;
1676 int index = 0;
1677 int tmpValue = 0;
1678
// Short b: hand over to the scalar helper (defined outside this block).
1679 if(lenb <= e)
1680 {
1681 return smallEditDistanceF(a,lena,b,lenb);
1682 }
1683
1684
1685 register __m128i R0, R1;
1686 __m128i Diag;
1687 __m128i Side1, Side2;
1688 __m128i Down1, Down2;
1689 __m128i tmp;
1690 register __m128i SeqA, SeqB;
1691 __m128i Result;
1692
1693 __m128i tmpSeqA;
1694 __m128i tmpSeqB;
1695
1696 /* initialize */
1697 R0 = _mm_setzero_si128 ();
1698 R1 = _mm_setzero_si128 ();
1699 Diag = _mm_setzero_si128 ();
1700 Side1 = _mm_setzero_si128 ();
1701 Side2 = _mm_setzero_si128 ();
1702 Down1 = _mm_setzero_si128 ();
1703 Down2 = _mm_setzero_si128 ();
1704 SeqA = _mm_setzero_si128 ();
1705 SeqB = _mm_setzero_si128 ();
1706 Result = _mm_setzero_si128 ();
1707 /* end initialize */
1708
// The xor-with-self calls below zero vectors that are already zero; they are redundant but harmless.
1709 R1 = _mm_xor_si128(R1, R1);
1710 R0 = _mm_xor_si128(R0, R0);
1711
1712 Diag = _mm_xor_si128(Diag, Diag);
1713 Diag = _mm_insert_epi16(Diag,2*e,0);
1714
// Compute the first few DP cells in scalar code from the leading characters of a and b.
1715 i0 = (a[0] != b[0]);
1716 i1 = min(i0, (a[1]!=b[0]))+1;
1717 i2 = min(i0,(a[0]!=b[1]))+1;
1718
1719 i0 = min3(i0+(a[1]!=b[1]),i1+1,i2+1);
1720 i4 = min(i1, (a[2]!=b[0])+1)+1;
1721 i5 = min(i2, (a[0]!=b[2])+1)+1;
1722
// Seed R1 (lanes 0..3) and R0 (lanes 0..4) with those cells; the outer lanes get the boundary costs 3 and 4.
1723 R1 = _mm_insert_epi16(R1, 3, 0);
1724 R1 = _mm_insert_epi16(R1, i1, 1);
1725 R1 = _mm_insert_epi16(R1, i2, 2);
1726 R1 = _mm_insert_epi16(R1, 3, 3);
1727
1728 R0 = _mm_insert_epi16(R0, 4, 0);
1729 R0 = _mm_insert_epi16(R0, i4, 1);
1730 R0 = _mm_insert_epi16(R0, i0, 2);
1731 R0 = _mm_insert_epi16(R0, i5, 3);
1732 R0 = _mm_insert_epi16(R0, 4, 4);
1733
1734 Side2 = _mm_xor_si128(Side2, Side2);
1735 Down2 = _mm_xor_si128(Down2, Down2);
1736 Down1 = _mm_xor_si128(Down1, Down1);
1737 Side1 = _mm_xor_si128(Side1, Side1);
1738
1739 Side2 = _mm_insert_epi16(Side2,2*e,0);
1740 Down1 = _mm_insert_epi16(Down1,2*e,0);
1741
1742 Side1 = _mm_insert_epi16(Side1,1,0);
1743
// Build the per-lane step costs (1 per lane, with 2*e pushed to the top lane of Side2/Down1)
// and preload the first e characters of a and b, newest character in lane 0.
1744 index = 0;
1745 for(j=0; j < e; j++)
1746 {
1747 Side2 = _mm_slli_si128(Side2, 2);
1748 Side2 = _mm_insert_epi16(Side2,1,0);
1749
1750 Down1 = _mm_slli_si128(Down1, 2);
1751 Down1 = _mm_insert_epi16(Down1,1,0);
1752
1753 Down2 = _mm_slli_si128(Down2, 2);
1754 Down2 = _mm_insert_epi16(Down2,1,0);
1755
1756 Side1 = _mm_slli_si128(Side1, 2);
1757 Side1 = _mm_insert_epi16(Side1,1,0);
1758
1759 SeqA = _mm_slli_si128(SeqA, 2);
1760 SeqB = _mm_slli_si128(SeqB, 2);
1761 SeqA = _mm_insert_epi16(SeqA,a[index],0);
1762 SeqB = _mm_insert_epi16(SeqB,b[index],0);
1763 index++;
1764 }
1765
// Down2 gets 2*e in lane 0 instead of the top lane.
1766 Down2= _mm_slli_si128(Down2, 2);
1767 Down2 = _mm_insert_epi16(Down2,2*e,0);
1768
1769 index = 4;
1770 i = 5;
1771
// Main loop: even i updates R0, odd i updates R1.
1772 int loopEnd = 2*lenb-(e-1);
1773 for(; i <= loopEnd ;i++)
1774 {
1775 //Diag = _mm_xor_si128(Diag, Diag);
1776 if( i%2 == 0)
1777 {
// Shift the next character of each sequence into lane 0.
1778 tmpSeqA = _mm_slli_si128(SeqA, 2);
1779 tmpSeqB = _mm_slli_si128(SeqB, 2);
1780 SeqA = _mm_insert_epi16(tmpSeqA,a[index],0);
1781 SeqB = _mm_insert_epi16(tmpSeqB,b[index],0);
1782
1783 index++;
1784
// Reverse SeqB lanes 0..4 into tmp: shuffle 27 reverses lanes 0..3, the byte shift moves them up one lane,
// and the old lane 4 (now in lane 5) is copied into lane 0.
1785 tmp = _mm_shufflelo_epi16(SeqB,27);
1786 tmp = _mm_slli_si128(tmp, 2);
1787 tmpValue = _mm_extract_epi16(tmp, 5);
1788 tmp = _mm_insert_epi16(tmp, tmpValue, 0);
1789
// Diag is 0 in lanes where the characters match and MASK's lane value elsewhere
// (MASK is defined outside this block; presumably 1 per lane - confirm).
1790 Result = _mm_cmpeq_epi16(SeqA, tmp);
1791 Diag = _mm_andnot_si128(Result, MASK);
1792
// New R0 = lane-wise minimum of the three predecessor candidates.
1793 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1794 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1795
// Early exit: give up once all of R0 lanes 0..4 and R1 lanes 0..3 exceed e.
1796 if(_mm_extract_epi16(R0, 0) > e && _mm_extract_epi16(R0, 1) > e && _mm_extract_epi16(R0, 2) > e
1797 && _mm_extract_epi16(R0, 3) > e && _mm_extract_epi16(R0, 4) > e && _mm_extract_epi16(R1, 0) > e &&
1798 _mm_extract_epi16(R1, 1) > e && _mm_extract_epi16(R1, 2) > e && _mm_extract_epi16(R1, 3) > e)
1799 return -1;
1800
// First candidate result: overwrite the initial minError with lane e of R0.
1801 if(i == 2*lenb-e)
1802 {
1803 tmp = _mm_srli_si128(R0,2);
1804 for(k=0; k < e-1;k++)
1805 tmp = _mm_srli_si128(tmp,2);
1806 minError = _mm_extract_epi16(tmp,0);
1807 }
1808
1809 }
1810
1811 else
1812 {
// Odd step: compare SeqA lanes 0..3 against SeqB lanes 3..0 and update R1.
1813 Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27));
1814 Diag = _mm_andnot_si128(Result, MASK);
1815
1816 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
1817 R1 = _mm_min_epi16(R1, R0+Down1);
1818
// Near the end, fold lane e-1 of R1 into the running minimum.
1819 if(i >= 2*lenb-e)
1820 {
1821 tmp = _mm_srli_si128(R1,2);
1822 for(k=0; k < e-2;k++)
1823 tmp = _mm_srli_si128(tmp,2);
1824 minError = min(minError, _mm_extract_epi16(tmp,0));
1825 }
1826 }
1827
1828
1829 }
// Tail: 2*(e-2)+1 further steps. Diag is rebuilt from scalar comparisons each step,
// and tmpE shrinks the number of compared lanes after every R0 update.
1830 j=0;
1831 int tmpE = e;
1832 for(;j<2*(e-2)+1;j++)
1833 {
1834
1835 Diag = _mm_xor_si128(Diag, Diag);
1836 //set the first element
1837 if(j==0)
1838 {
1839 for( k=0;k<=e-1;k++ )
1840 {
1841 Diag = _mm_slli_si128(Diag, 2);
1842 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
1843 }
1844
1845 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1846 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1847
1848 tmpE--;
1849
// Bring lane e-1 of R0 down to lane 0 and fold it into minError.
1850 tmp = _mm_srli_si128(R0,2);
1851 for(k=0; k < e-2;k++)
1852 tmp = _mm_srli_si128(tmp,2);
1853 minError = min(minError, _mm_extract_epi16(tmp,0));
1854 }
1855 else if(j%2 == 0)
1856 {
1857 for(k=0;k<tmpE;k++)
1858 {
1859 Diag = _mm_slli_si128(Diag, 2);
1860 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
1861 }
1862
1863 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
1864 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
1865
1866 tmpE--;
1867
1868 tmp = _mm_srli_si128(R0,2);
1869 for(k=0; k < tmpE-1;k++)
1870 tmp = _mm_srli_si128(tmp,2);
1871 minError = min(minError, _mm_extract_epi16(tmp,0));
1872 }
1873
1874
1875 else
1876 {
1877 for(k=0;k<tmpE;k++)
1878 {
1879 Diag = _mm_slli_si128(Diag, 2);
1880 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
1881 }
1882
1883 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
1884 R1 = _mm_min_epi16(R1, R0+Down1);
1885
1886 tmp = _mm_srli_si128(R1,2);
1887 for(k=0; k < tmpE-2;k++)
1888 tmp = _mm_srli_si128(tmp,2);
1889 minError = min(minError, _mm_extract_epi16(tmp,0));
1890 }
1891 i++;
1892 }
1893 //Diag
1894
// Last two steps, written out by hand; lane 0 of Diag and Down1 is blocked with 2*e for the first of them.
1895 Diag = _mm_xor_si128(Diag,Diag);
1896 Diag = _mm_insert_epi16(Diag, 2*e, 0);
1897 Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1);
1898
1899 Side1 = _mm_insert_epi16(Side1,1,0);
1900 Side1 = _mm_insert_epi16(Side1,1,1);
1901
1902 Down1 = _mm_insert_epi16(Down1, 2*e, 0);
1903 Down1 = _mm_insert_epi16(Down1, 1, 1);
1904
1905 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
1906 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
1907
1908 minError = min(minError, _mm_extract_epi16(R1,1));
1909
// Highest index of a read by this function: a[lenb+e-1].
1910 Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 0);
1911 Down1 = _mm_insert_epi16(Down1, 1, 0);
1912
1913 R0 = _mm_min_epi16(R1+Down1,R0+Diag);
1914 R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1);
1915
1916 minError = min(minError, _mm_extract_epi16(R0,0));
1917
// Report -1 when even the best collected value is above the threshold.
1918 if(minError > e)
1919 return -1;
1920 return minError;
1921 }
1922
/*
 * forwardEditDistanceSSE2Odd
 *
 * Banded edit-distance check of b against a, computed with SSE2 on 16-bit
 * lanes. R0 and R1 hold alternating anti-diagonals of the DP matrix; the
 * mismatch vector Diag is rebuilt from scalar character comparisons on
 * every step. 2*e is used as the "blocked" cost on the band edges.
 *
 * Phases: (1) grow the band one lane per step for i = 2..e, (2) run the
 * full-width band up to i = 2*lenb-(e-1), (3) five hand-unrolled closing
 * steps.
 *
 * NOTE(review): the name suggests this variant is meant for odd values of
 * errThreshold, and the closing steps use fixed lane indices (0..3) and
 * fixed offsets from lena/lenb, which looks sized for one specific e -
 * confirm against the callers.
 *
 * Parameters:
 *   a, lena - first sequence and its length. The closing steps read
 *             a[lena-2] .. a[lena+2], so characters past lena must be
 *             readable.
 *   b, lenb - second sequence and its length; lenb drives the loop bounds.
 *
 * Returns:
 *   0 when either length is 0; the result of smallEditDistanceF() when
 *   lenb <= e; -1 when an even step finds no lane <= e or when the final
 *   minimum exceeds e; otherwise the final minimum.
 *
 * NOTE(review): expressions such as R1+Side1 apply `+` directly to __m128i,
 * which relies on the compiler's vector extension; presumably no lane ever
 * carries out of 16 bits, so this behaves like _mm_add_epi16 - confirm.
 */
1923 int forwardEditDistanceSSE2Odd(char *a, int lena, char *b,int lenb)
1924 {
1925 if(lenb == 0 || lena == 0)
1926 return 0;
1927
1928 int i = 0;
1929 int j = 0;
1930 int k = 0;
1931
1932 int e = errThreshold;
1933
// Starts at 2*e, the same value used as the "blocked" cost in the cost vectors.
1934 int minError = 2*e;
1935
// Set to 1 inside an even step when at least one inspected lane of R0 is still <= e.
1936 char flag = 0;
1937
// Short b: hand over to the scalar helper (defined outside this block).
1938 if(lenb <= e)
1939 {
1940 return smallEditDistanceF(a,lena,b,lenb);
1941 }
1942
1943
1944 __m128i R0, R1;
1945 __m128i Diag;
1946 __m128i Side1, Side2;
1947 __m128i Down1, Down2;
1948 __m128i Error;
1949 __m128i tmp;
1950
1951 /* initialize */
1952 R0 = _mm_setzero_si128 ();
1953 R1 = _mm_setzero_si128 ();
1954 Diag = _mm_setzero_si128 ();
1955 Side1 = _mm_setzero_si128 ();
1956 Side2 = _mm_setzero_si128 ();
1957 Down1 = _mm_setzero_si128 ();
1958 Down2 = _mm_setzero_si128 ();
1959 Error = _mm_setzero_si128 ();
1960 tmp = _mm_setzero_si128 ();
1961 /* end initialize */
1962
// The xor-with-self calls below zero vectors that are already zero; they are redundant but harmless.
1963 R1 = _mm_xor_si128(R1, R1);
1964 R0 = _mm_xor_si128(R0, R0);
1965
1966 Diag = _mm_xor_si128(Diag, Diag);
1967 Side1 = _mm_xor_si128(Side1, Side1);
1968 Down1 = _mm_xor_si128(Down1, Down1);
1969
1970 Diag = _mm_insert_epi16(Diag,2*e,0);
1971
1972 Side1 = _mm_insert_epi16(Side1,1,0);
1973 Side1 = _mm_insert_epi16(Side1,2*e,1);
1974
1975 Down1 = _mm_insert_epi16(Down1,2*e,0);
1976 Down1 = _mm_insert_epi16(Down1,1,1);
1977 Down1 = _mm_insert_epi16(Down1,2*e,2);
1978
// Starting state: R0 = {0}, R1 = {1, 1}.
1979 R0 = _mm_insert_epi16(R0,0,0);
1980
1981 R1 = _mm_insert_epi16(R1,1,0);
1982 R1 = _mm_insert_epi16(R1,1,1);
1983
// Phase 1: the band widens by one lane per step until it reaches full width.
1984 for(i=2; i <= e; i++)
1985 {
1986 //set side
1987 Side1 = _mm_slli_si128(Side1,2);
1988 Side1 = _mm_insert_epi16(Side1,1,0);
1989
1990 Down1 = _mm_insert_epi16(Down1,1,0);
1991 Down1 = _mm_slli_si128(Down1,2);
1992 Down1 = _mm_insert_epi16(Down1,2*e,0);
1993
1994 Diag = _mm_xor_si128(Diag, Diag);
1995 if( i%2 == 0)
1996 {
// Diag = mismatch flags for this step, framed by 2*e on both ends.
1997 Diag = _mm_insert_epi16(Diag,2*e,0);
1998
1999 for(j=1;j<=i-1;j++)
2000 {
2001 Diag = _mm_slli_si128(Diag, 2);
2002 Diag = _mm_insert_epi16(Diag, b[i/2-1+(i/2-j)] != a[i/2-1-(i/2-j)],0);
2003 }
2004 Diag = _mm_slli_si128(Diag, 2);
2005 Diag = _mm_insert_epi16(Diag, 2*e,0);
2006
2007 R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag);
2008 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1);
2009
2010 }
2011
2012 else
2013 {
2014 Diag = _mm_insert_epi16(Diag,2*e,0);
2015 for(j=i/2-1;j>=-i/2;j--)
2016 {
2017 Diag = _mm_slli_si128(Diag, 2);
2018 Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i-1)/2-j-1],0);
2019 }
2020 Diag = _mm_slli_si128(Diag, 2);
2021 Diag = _mm_insert_epi16(Diag, 2*e,0);
2022
2023 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
2024 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
2025
2026 }
2027 }
// Rebuild the cost vectors for the full-width phase.
// NOTE(review): Error is filled in below but no later statement in this function reads it.
2028 Error = _mm_xor_si128(Error, Error);
2029 Side2 = _mm_xor_si128(Side2, Side2);
2030 Side1 = _mm_xor_si128(Side1, Side1);
2031 Down2 = _mm_xor_si128(Down2, Down2);
2032 Down1 = _mm_xor_si128(Down1, Down1);
2033
2034
2035 Error = _mm_insert_epi16(Error,e,0);
2036 Side2 = _mm_insert_epi16(Side2,2*e,0);
// NOTE(review): the source operand here is Side2, not Side1. Both hold the same value at this point
// (all zero with 2*e in lane 0), so the result is the same either way.
2037 Side1 = _mm_insert_epi16(Side2,2*e,0);
2038 Down1 = _mm_insert_epi16(Down1,2*e,0);
2039
2040
2041 for(j=0; j < e; j++)
2042 {
2043 Side2 = _mm_slli_si128(Side2, 2);
2044 Side2 = _mm_insert_epi16(Side2,1,0);
2045
2046 Side1 = _mm_slli_si128(Side1, 2);
2047 Side1 = _mm_insert_epi16(Side1,1,0);
2048
2049 Down1 = _mm_slli_si128(Down1, 2);
2050 Down1 = _mm_insert_epi16(Down1,1,0);
2051
2052 Down2 = _mm_slli_si128(Down2, 2);
2053 Down2 = _mm_insert_epi16(Down2,1,0);
2054
2055 Error = _mm_slli_si128(Error, 2);
2056 Error = _mm_insert_epi16(Error, e, 0);
2057 }
2058
2059 Down2= _mm_slli_si128(Down2, 2);
2060 Down2 = _mm_insert_epi16(Down2,2*e,0);
2061
// Phase 2: full-width band. i continues from e+1; even i updates R0, odd i updates R1.
2062 for(; i <= 2*lenb-(e-1);i++)
2063 {
2064 flag = 0;
2065 Diag = _mm_xor_si128(Diag, Diag);
2066 if( i%2 == 0)
2067 {
2068 for(j=e/2;j>=-e/2;j--)
2069 {
2070 Diag = _mm_slli_si128(Diag, 2);
2071 Diag = _mm_insert_epi16(Diag, b[i/2-1+j] != a[i/2-1-j],0);
2072 }
2073
2074
2075 R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag);
2076 R0 = _mm_min_epi16(R0, R1+Down1);
2077
// Early exit: scan lanes 0..e-1 of R0 and give up when none is <= e.
2078 if(_mm_extract_epi16(R0,0) <= e)
2079 flag = 1;
2080
2081 tmp = _mm_srli_si128(R0,2);
2082 for(j=0; j < e-1;j++)
2083 {
2084 if(_mm_extract_epi16(tmp,0) <= e)
2085 flag = 1;
2086 tmp = _mm_srli_si128(tmp,2);
2087 }
2088 // printf("#%d %d %d\n", _mm_extract_epi16(R0,0), _mm_extract_epi16(R0,1), _mm_extract_epi16(R0,2));
2089 if(flag == 0)
2090 return -1;
2091
// On the last iteration, overwrite minError with lane e-1 of R0.
2092 if(i == 2*lenb-(e-1))
2093 {
2094 tmp = _mm_srli_si128(R0,2);
2095 for(k=0; k < e-2;k++)
2096 tmp = _mm_srli_si128(tmp,2);
2097 minError = _mm_extract_epi16(tmp,0);
2098 }
2099
2100 }
2101
2102 else
2103 {
2104 for(j=e/2;j>=-e/2-1;j--)
2105 {
2106 Diag = _mm_slli_si128(Diag, 2);
2107 Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i)/2-j-1],0);
2108 }
2109
2110 R1 = _mm_min_epi16(R0+Side2, R1+Diag);
2111 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2);
2112
2113 //printf("#%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2),
2114 // _mm_extract_epi16(R1,3));
2115
// Near the end, fold lane e of R1 into the running minimum.
2116 if(i >= 2*lenb-e)
2117 {
2118 tmp = _mm_srli_si128(R1,2);
2119 for(k=0; k < e-1;k++)
2120 tmp = _mm_srli_si128(tmp,2);
2121 minError = min(minError, _mm_extract_epi16(tmp,0));
2122 }
2123 }
2124 }
2125
// Phase 3: five closing steps written out by hand. Each rebuilds Diag from fixed offsets,
// terminates it with 2*e, updates R1 or R0, and folds one fixed lane into minError.
2126 //first cell
2127 Diag = _mm_xor_si128(Diag,Diag);
2128 Diag = _mm_insert_epi16(Diag, b[lenb-3] != a[lena], 0);
2129 Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena-1], 1);
2130 Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena-2], 2);
2131 Diag = _mm_insert_epi16(Diag, 2*e, 3);
2132 R1 = _mm_min_epi16(R0+Side2, R1+Diag);
2133 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2);
2134
2135
2136 minError = min(minError, _mm_extract_epi16(R1,2));
2137
2138 //second cell
2139 Diag = _mm_xor_si128(Diag,Diag);
2140 Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena], 0);
2141 Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena-1], 1);
2142 Diag = _mm_insert_epi16(Diag, 2*e, 2);
2143
2144 R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag);
2145 R0 = _mm_min_epi16(R0, R1+Down1);
2146
2147
2148 minError = min(minError, _mm_extract_epi16(R0,1));
2149
2150 //third cell
2151 Diag = _mm_xor_si128(Diag,Diag);
2152 Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena+1], 0);
2153 Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena], 1);
2154 Diag = _mm_insert_epi16(Diag, 2*e, 2);
2155
2156 R1 = _mm_min_epi16(R0+Side2, R1+Diag);
2157 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2);
2158
2159
2160 minError = min(minError, _mm_extract_epi16(R1,1));
2161
2162 //fourth cell
2163 Diag = _mm_xor_si128(Diag,Diag);
2164 Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena+1], 0);
2165 Diag = _mm_insert_epi16(Diag, 2*e, 1);
2166
2167 R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag);
2168 R0 = _mm_min_epi16(R0, R1+Down1);
2169
2170 minError = min(minError, _mm_extract_epi16(R0,0));
2171
2172 //fifth cell (highest index of a read by this function: a[lena+2])
2173 Diag = _mm_xor_si128(Diag,Diag);
2174 Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena+2], 0);
2175 Diag = _mm_insert_epi16(Diag, 2*e, 1);
2176
2177 R1 = _mm_min_epi16(R0+Side2, R1+Diag);
2178 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2);
2179
2180
2181 minError = min(minError, _mm_extract_epi16(R1,0));
2182
// Report -1 when even the best collected value is above the threshold.
2183 if(minError > e)
2184 return -1;
2185 return minError;
2186
2187 }
2188
/*
 * forwardEditDistanceSSE2G
 *
 * Banded edit-distance check of b against a, computed with SSE2 on 16-bit
 * lanes. R0 and R1 hold alternating anti-diagonals of the DP matrix; the
 * mismatch vector Diag is rebuilt from scalar character comparisons on
 * every step, with loop bounds derived from e = errThreshold. 2*e is used
 * as the "blocked" cost on the band edges.
 *
 * Phases: (1) grow the band one lane per step for i = 2..e, (2) run the
 * full-width band up to i = 2*lenb-(e-1), (3) a shrinking tail of
 * 2*(e-2)+1 steps, (4) two hand-written closing steps.
 *
 * Parameters:
 *   a, lena - first sequence and its length. lena is only tested against 0
 *             and forwarded to smallEditDistanceF(); the main path indexes a
 *             up to a[lenb+e-1], so that many characters must be readable.
 *   b, lenb - second sequence and its length; lenb drives the loop bounds.
 *
 * Returns:
 *   0 when either length is 0; the result of smallEditDistanceF() when
 *   lenb <= e; -1 when an even step finds no lane <= e or when the final
 *   minimum exceeds e; otherwise the final minimum.
 *
 * NOTE(review): expressions such as R1+Side1 apply `+` directly to __m128i,
 * which relies on the compiler's vector extension; presumably no lane ever
 * carries out of 16 bits, so this behaves like _mm_add_epi16 - confirm.
 */
2189 int forwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb)
2190 {
2191 if(lenb == 0 || lena == 0)
2192 return 0;
2193
2194 int i = 0;
2195 int j = 0;
2196 int k = 0;
2197
2198 int e = errThreshold;
2199
// Starts at 2*e, the same value used as the "blocked" cost in the cost vectors.
2200 int minError = 2*e;
2201
// Set to 1 inside an even step when at least one inspected lane of R0 is still <= e.
2202 char flag = 0;
2203
// Short b: hand over to the scalar helper (defined outside this block).
2204 if(lenb <= e)
2205 {
2206 return smallEditDistanceF(a,lena,b,lenb);
2207 }
2208
2209
2210 __m128i R0, R1;
2211 __m128i Diag;
2212 __m128i Side1, Side2;
2213 __m128i Down1, Down2;
2214 __m128i Error;
2215 __m128i tmp;
2216
2217 /* initialize */
2218 R0 = _mm_setzero_si128 ();
2219 R1 = _mm_setzero_si128 ();
2220 Diag = _mm_setzero_si128 ();
2221 Side1 = _mm_setzero_si128 ();
2222 Side2 = _mm_setzero_si128 ();
2223 Down1 = _mm_setzero_si128 ();
2224 Down2 = _mm_setzero_si128 ();
2225 Error = _mm_setzero_si128 ();
2226 tmp = _mm_setzero_si128 ();
2227 /* end initialize */
2228
// The xor-with-self calls below zero vectors that are already zero; they are redundant but harmless.
2229 R1 = _mm_xor_si128(R1, R1);
2230 R0 = _mm_xor_si128(R0, R0);
2231
2232 Diag = _mm_xor_si128(Diag, Diag);
2233 Side1 = _mm_xor_si128(Side1, Side1);
2234 Down1 = _mm_xor_si128(Down1, Down1);
2235
2236 Diag = _mm_insert_epi16(Diag,2*e,0);
2237
2238 Side1 = _mm_insert_epi16(Side1,1,0);
2239 Side1 = _mm_insert_epi16(Side1,2*e,1);
2240
2241 Down1 = _mm_insert_epi16(Down1,2*e,0);
2242 Down1 = _mm_insert_epi16(Down1,1,1);
2243 Down1 = _mm_insert_epi16(Down1,2*e,2);
2244
// Starting state: R0 = {0}, R1 = {1, 1}.
2245 R0 = _mm_insert_epi16(R0,0,0);
2246
2247 R1 = _mm_insert_epi16(R1,1,0);
2248 R1 = _mm_insert_epi16(R1,1,1);
2249
// Phase 1: the band widens by one lane per step until it reaches full width.
2250 for(i=2; i <= e; i++)
2251 {
2252 //set side
2253 Side1 = _mm_slli_si128(Side1,2);
2254 Side1 = _mm_insert_epi16(Side1,1,0);
2255
2256 Down1 = _mm_insert_epi16(Down1,1,0);
2257 Down1 = _mm_slli_si128(Down1,2);
2258 Down1 = _mm_insert_epi16(Down1,2*e,0);
2259
2260 Diag = _mm_xor_si128(Diag, Diag);
2261 if( i%2 == 0)
2262 {
// Diag = mismatch flags for this step, framed by 2*e on both ends.
2263 Diag = _mm_insert_epi16(Diag,2*e,0);
2264
2265 for(j=1;j<=i-1;j++)
2266 {
2267 Diag = _mm_slli_si128(Diag, 2);
2268 Diag = _mm_insert_epi16(Diag, b[i/2-1+(i/2-j)] != a[i/2-1-(i/2-j)],0);
2269 }
2270 Diag = _mm_slli_si128(Diag, 2);
2271 Diag = _mm_insert_epi16(Diag, 2*e,0);
2272
2273 R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag);
2274 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1);
2275 }
2276
2277 else
2278 {
2279 Diag = _mm_insert_epi16(Diag,2*e,0);
2280 for(j=i/2-1;j>=-i/2;j--)
2281 {
2282 Diag = _mm_slli_si128(Diag, 2);
2283 Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i-1)/2-j-1],0);
2284 }
2285 Diag = _mm_slli_si128(Diag, 2);
2286 Diag = _mm_insert_epi16(Diag, 2*e,0);
2287
2288 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
2289 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
2290 }
2291 }
// Rebuild Side2, Down1 and Down2 for the full-width phase. Side1 is not reset here, so it keeps
// the value accumulated during phase 1.
// NOTE(review): Error is filled in below but no later statement in this function reads it.
2292 Error = _mm_xor_si128(Error, Error);
2293 Side2 = _mm_xor_si128(Side2, Side2);
2294 Down2 = _mm_xor_si128(Down2, Down2);
2295 Down1 = _mm_xor_si128(Down1, Down1);
2296
2297 Error = _mm_insert_epi16(Error,e,0);
2298 Side2 = _mm_insert_epi16(Side2,2*e,0);
2299 Down1 = _mm_insert_epi16(Down1,2*e,0);
2300
2301
2302 for(j=0; j < e; j++)
2303 {
2304 Side2 = _mm_slli_si128(Side2, 2);
2305 Side2 = _mm_insert_epi16(Side2,1,0);
2306
2307 Down1 = _mm_slli_si128(Down1, 2);
2308 Down1 = _mm_insert_epi16(Down1,1,0);
2309
2310 Down2 = _mm_slli_si128(Down2, 2);
2311 Down2 = _mm_insert_epi16(Down2,1,0);
2312
2313 Error = _mm_slli_si128(Error, 2);
2314 Error = _mm_insert_epi16(Error, e, 0);
2315 }
2316
2317 Down2= _mm_slli_si128(Down2, 2);
2318 Down2 = _mm_insert_epi16(Down2,2*e,0);
2319
// Phase 2: full-width band. i continues from e+1; even i updates R0, odd i updates R1.
2320 for(; i <= 2*lenb-(e-1);i++)
2321 {
2322 flag = 0;
2323 Diag = _mm_xor_si128(Diag, Diag);
2324 if( i%2 == 0)
2325 {
2326 for(j=e/2;j>=-e/2;j--)
2327 {
2328 Diag = _mm_slli_si128(Diag, 2);
2329 Diag = _mm_insert_epi16(Diag, b[i/2-1+j] != a[i/2-1-j],0);
2330 }
2331
2332 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
2333 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
2334
2335
// Early exit: scan lanes 0..e-1 of R0 and give up when none is <= e.
2336 if(_mm_extract_epi16(R0,0) <= e)
2337 flag = 1;
2338
2339 tmp = _mm_srli_si128(R0,2);
2340 for(j=0; j < e-1;j++)
2341 {
2342 if(_mm_extract_epi16(tmp,0) <= e)
2343 flag = 1;
2344 tmp = _mm_srli_si128(tmp,2);
2345 }
2346
2347
2348 if(flag == 0)
2349 return -1;
2350
// First candidate result: overwrite the initial minError with lane e of R0.
2351 if(i == 2*lenb-e)
2352 {
2353 tmp = _mm_srli_si128(R0,2);
2354 for(k=0; k < e-1;k++)
2355 tmp = _mm_srli_si128(tmp,2);
2356 minError = _mm_extract_epi16(tmp,0);
2357 }
2358
2359 }
2360
2361 else
2362 {
2363 for(j=-e/2+1;j<=e/2;j++)
2364 {
2365 Diag = _mm_slli_si128(Diag, 2);
2366 Diag = _mm_insert_epi16(Diag, b[(i+1)/2-j-1] != a[(i-1)/2+j-1],0);
2367 }
2368
2369 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
2370 R1 = _mm_min_epi16(R1, R0+Down1);
2371
// Near the end, fold lane e-1 of R1 into the running minimum.
2372 if(i >= 2*lenb-e)
2373 {
2374 tmp = _mm_srli_si128(R1,2);
2375 for(k=0; k < e-2;k++)
2376 tmp = _mm_srli_si128(tmp,2);
2377 minError = min(minError, _mm_extract_epi16(tmp,0));
2378 }
2379 }
2380 }
2381
// Phase 3: 2*(e-2)+1 further steps. Diag is rebuilt from scalar comparisons each step,
// and tmpE shrinks the number of compared lanes after every R0 update.
2382 j=0;
2383 int tmpE = e;
2384 for(;j<2*(e-2)+1;j++)
2385 {
2386
2387 Diag = _mm_xor_si128(Diag, Diag);
2388 //set the first element
2389 if(j==0)
2390 {
2391 for( k=0;k<=e-1;k++ )
2392 {
2393 Diag = _mm_slli_si128(Diag, 2);
2394 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
2395 }
2396
2397 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
2398 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
2399
2400 tmpE--;
2401
// Bring lane e-1 of R0 down to lane 0 and fold it into minError.
2402 tmp = _mm_srli_si128(R0,2);
2403 for(k=0; k < e-2;k++)
2404 tmp = _mm_srli_si128(tmp,2);
2405 minError = min(minError, _mm_extract_epi16(tmp,0));
2406 }
2407 else if(j%2 == 0)
2408 {
2409 for(k=0;k<tmpE;k++)
2410 {
2411 Diag = _mm_slli_si128(Diag, 2);
2412 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
2413 }
2414
2415 R0 = _mm_min_epi16(R1+Side2, R0+Diag);
2416 R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2);
2417
2418 tmpE--;
2419
2420 tmp = _mm_srli_si128(R0,2);
2421 for(k=0; k < tmpE-1;k++)
2422 tmp = _mm_srli_si128(tmp,2);
2423 minError = min(minError, _mm_extract_epi16(tmp,0));
2424 }
2425
2426
2427 else
2428 {
2429 for(k=0;k<tmpE;k++)
2430 {
2431 Diag = _mm_slli_si128(Diag, 2);
2432 Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0);
2433 }
2434
2435 R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag);
2436 R1 = _mm_min_epi16(R1, R0+Down1);
2437
// NOTE(review): the shift count below is tmpE-1, whereas the matching branch of
// forwardEditDistance4SSE2 uses tmpE-2, so a different lane of R1 is folded in here -
// confirm which is intended.
2438 tmp = _mm_srli_si128(R1,2);
2439 for(k=0; k < tmpE-1;k++)
2440 tmp = _mm_srli_si128(tmp,2);
2441 minError = min(minError, _mm_extract_epi16(tmp,0));
2442 }
2443 i++;
2444 }
2445 //Diag
2446
// Phase 4: last two steps, written out by hand; lane 0 of Diag and Down1 is blocked with 2*e.
2447 Diag = _mm_xor_si128(Diag,Diag);
2448 Diag = _mm_insert_epi16(Diag, 2*e, 0);
2449 Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1);
2450
2451 Side1 = _mm_insert_epi16(Side1,1,0);
2452 Side1 = _mm_insert_epi16(Side1,1,1);
2453
2454 Down1 = _mm_insert_epi16(Down1, 2*e, 0);
2455 Down1 = _mm_insert_epi16(Down1, 1, 1);
2456
2457 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
2458 R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1);
2459
2460 minError = min(minError, _mm_extract_epi16(R1,1));
2461
// NOTE(review): this writes the mismatch flag into lane 1, leaving 2*e in lane 0 of Diag, while only
// lane 0 of R0 is read afterwards. forwardEditDistance4SSE2 writes the same flag into lane 0 at the
// corresponding point - possible copy/paste slip, confirm.
2462 Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 1);
2463 Down1 = _mm_insert_epi16(Down1, 1, 0);
2464
2465 R0 = _mm_min_epi16(R1+Down1,R0+Diag);
2466 R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1);
2467
2468 minError = min(minError, _mm_extract_epi16(R0,0));
2469
// Report -1 when even the best collected value is above the threshold.
2470 if(minError > e)
2471 return -1;
2472 return minError;
2473 }
2474
2475
2476 int forwardEditDistance2SSE2(char *a, int lena, char *b,int lenb)
2477 {
2478 if(lenb == 0 || lena == 0)
2479 return 0;
2480
2481
2482
2483 int i0 = 0;
2484 int i1 = 0;
2485
2486
2487 int error; //0: if the two character are equal 1: if not
2488
2489 int i = 0; //loop index
2490
2491 int e = 2; //error bound
2492
2493 int totalError = 0;
2494
2495 __m128i R0;
2496 __m128i R1;
2497
2498 __m128i Side1, Side2,Side; //side matrix
2499 __m128i Down1, Down2,Down; //down matrix
2500 __m128i Diag;
2501
2502 __m128i tmp;
2503 __m128i ERROR_REACH;
2504
2505 /* initialize */
2506 R0 = _mm_setzero_si128 ();
2507 R1 = _mm_setzero_si128 ();
2508 Diag = _mm_setzero_si128 ();
2509 Side1 = _mm_setzero_si128 ();
2510 Side2 = _mm_setzero_si128 ();
2511 Down1 = _mm_setzero_si128 ();
2512 Down2 = _mm_setzero_si128 ();
2513 Side = _mm_setzero_si128 ();
2514 Down = _mm_setzero_si128 ();
2515 tmp = _mm_setzero_si128 ();
2516 ERROR_REACH = _mm_setzero_si128 ();
2517 /* end initialize */
2518
2519
2520 if(lenb <= e)
2521 {
2522 return smallEditDistanceF(a,lena,b,lenb);
2523 }
2524
2525 ERROR_REACH = _mm_set_epi16(0,0,0,0,0,e,e,e);
2526
2527 R0 = _mm_insert_epi16(R0,0,0);
2528
2529 R1 = _mm_insert_epi16(R1,1,0);
2530 R1 = _mm_insert_epi16(R1,1,1);
2531
2532 // error = ((a[0]) != (b[0]));
2533
2534 Diag = _mm_set_epi16(0,0,0,0,0,2*e,((a[0]) != (b[0])),2*e);
2535 Side1 = _mm_set_epi16(0,0,0,0,0,2*e,1,1);
2536 Side2 = _mm_set_epi16(0,0,0,0,0,1,1,2*e);
2537 Down1 = _mm_set_epi16(0,0,0,0,0,2*e,1,1);
2538 Down2 = _mm_set_epi16(0,0,0,0,0,1,1,2*e);
2539
2540 tmp = _mm_slli_si128(R1,2);
2541
2542 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
2543 R0 = _mm_min_epi16(R0,tmp+Down2);
2544
2545 for (i = 3; i < 2*lena; i++)
2546 {
2547 if(i % 2 ==1)
2548 {
2549
2550 Diag = _mm_xor_si128(Diag, Diag);
2551 error = ((a[(i+1)/2-1]) != (b[(i-1)/2-1]));
2552 Diag = _mm_insert_epi16(Diag,error,0);
2553 error = ((a[(i-1)/2-1]) != (b[(i+1)/2-1]));
2554 Diag = _mm_insert_epi16(Diag,error,1);
2555 // Diag = _mm_set_epi16(0, 0, 0, 0, 0, 0, ((a[(i-1)/2-1]) != (b[(i+1)/2-1])) ,((a[(i+1)/2-1]) != (b[(i-1)/2-1])));
2556
2557
2558 tmp = _mm_srli_si128(R0,2);
2559
2560 R1 = _mm_min_epi16(tmp+Side1, R1+Diag);
2561 R1 = _mm_min_epi16(R1,R0+Down1);
2562
2563 if(i > 2 * lenb - 2)
2564 {
2565 i1 = _mm_extract_epi16(R1, 1);
2566 totalError = min(totalError, i1);
2567 }
2568 }
2569
2570 else if(i % 2 == 0)
2571 {
2572 error = ((a[i/2]) != (b[i/2-2]));
2573 Diag = _mm_insert_epi16(Diag,error,0);
2574 error = ((a[i/2-1]) != (b[i/2-1]));
2575 Diag = _mm_insert_epi16(Diag,error,1);
2576 error = ((a[i/2-2]) != (b[i/2]));
2577 Diag = _mm_insert_epi16(Diag,error,2);
2578
2579 // Diag = _mm_set_epi16(0, 0, 0, 0, 0, ((a[i/2-2]) != (b[i/2])) , ((a[i/2-1]) != (b[i/2-1])) , ((a[i/2]) != (b[i/2-2])) );
2580
2581 tmp = _mm_slli_si128(R1,2);
2582
2583 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
2584 R0 = _mm_min_epi16(R0,tmp+Down2);
2585
2586 tmp = _mm_sub_epi16(ERROR_REACH, R0);
2587 i0 = _mm_movemask_epi8(tmp);
2588
2589 if(i0 == 63 && _mm_extract_epi16(R1,0) > errThreshold && _mm_extract_epi16(R1,1) > errThreshold && i < 2 * lenb - 2)
2590 return -1;
2591 if(i == 2 * lenb - 2) {
2592 totalError = _mm_extract_epi16(R0, 2);
2593 }
2594 }
2595 }
2596
2597 Down1 = _mm_insert_epi16(Down1,2*e,0);
2598
2599 //fill the first part of the error
2600 error = ((a[i/2]) != (b[i/2-2]));
2601 Diag = _mm_insert_epi16(Diag,error,0);
2602 error = ((a[i/2-1]) != (b[i/2-1]));
2603 Diag = _mm_insert_epi16(Diag,error,1);
2604 Diag = _mm_insert_epi16(Diag,2*e,2);
2605 // Diag = _mm_set_epi16(0, 0, 0, 0, 0, 2*e , ((a[i/2-1]) != (b[i/2-1])) , ((a[i/2]) != (b[i/2-2])) );
2606
2607 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
2608 R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1);
2609
2610 // i0 = _mm_extract_epi16(R0, 0);
2611 i1 = _mm_extract_epi16(R0, 1);
2612
2613 totalError = min(totalError, i1);
2614
2615 //fill the second part of the error
2616 i++;
2617
2618 Diag = _mm_xor_si128(Diag, Diag);
2619 Diag = _mm_insert_epi16(Diag,2*e,0);
2620 error = ((a[i/2]) != (b[lenb-1]));
2621 Diag = _mm_insert_epi16(Diag,error,1);
2622 Diag = _mm_insert_epi16(Diag,2*e,2);
2623 // Diag = _mm_set_epi16(0, 0, 0, 0, 0, 2*e , ((a[i/2]) != (b[lenb-1])) , 2*e );
2624
2625
2626 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
2627 R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1);
2628
2629 // i0 = _mm_extract_epi16(R1, 0);
2630 i1 = _mm_extract_epi16(R1, 1);
2631
2632 totalError = min(totalError, i1);
2633 //fill the last the last element of the matrix
2634 i++;
2635
2636 Diag = _mm_xor_si128(Diag, Diag);
2637 error = ((a[i/2]) != (b[lenb-1]));
2638 Diag = _mm_insert_epi16(Diag,error,0);
2639
2640 // Diag = _mm_set_epi16(0, 0, 0, 0, 0, 0 , 0 , ((a[i/2]) != (b[lenb-1])) );
2641
2642
2643 Down = _mm_insert_epi16(Down,1,0);
2644
2645 Side = _mm_insert_epi16(Side,1,0);
2646
2647 tmp = _mm_srli_si128(R1,2);
2648
2649 R0 = _mm_min_epi16(R1+Down, _mm_srli_si128(R0,2)+Diag);
2650 R0 = _mm_min_epi16(R0,tmp+Side);
2651
2652 i0 = _mm_extract_epi16(R0, 0);
2653
2654 totalError = min(totalError, i0);
2655
2656 if(totalError > e)
2657 return -1;
2658
2659 return totalError;
2660
2661 }
2662
2663 int backwardEditDistance2SSE2(char *a, int lena, char *b,int lenb)
2664 {
2665 if(lenb == 0 || lena == 0)
2666 return 0;
2667
2668 int i0 = 0;
2669 int i1 = 0;
2670
2671 int error; //0: if the two character are equal 1: if not
2672
2673 int i = 0; //loop index
2674
2675 int e = 2; //error bound
2676
2677 int totalError = 0;
2678
2679 __m128i R0;
2680 __m128i R1;
2681
2682 __m128i Side1, Side2,Side; //side matrix
2683 __m128i Down1, Down2,Down; //down matrix
2684 __m128i Diag; //diag matrix
2685
2686 __m128i tmp;
2687 __m128i ERROR_REACH;
2688
2689 /* initialize */
2690 R0 = _mm_setzero_si128 ();
2691 R1 = _mm_setzero_si128 ();
2692 Diag = _mm_setzero_si128 ();
2693 Side1 = _mm_setzero_si128 ();
2694 Side2 = _mm_setzero_si128 ();
2695 Side = _mm_setzero_si128 ();
2696 Down1 = _mm_setzero_si128 ();
2697 Down2 = _mm_setzero_si128 ();
2698 Down = _mm_setzero_si128 ();
2699 ERROR_REACH = _mm_setzero_si128 ();
2700 tmp = _mm_setzero_si128 ();
2701 /* end initialize */
2702
2703 if(lenb <= e)
2704 {
2705 return smallEditDistanceB(a,lena,b,lenb);
2706 }
2707
2708
2709 ERROR_REACH = _mm_set_epi16(0,0,0,0,0,e,e,e);
2710
2711 R0 = _mm_insert_epi16(R0,0,0);
2712
2713 R1 = _mm_insert_epi16(R1,1,0);
2714 R1 = _mm_insert_epi16(R1,1,1);
2715
2716 error = ((a[0]) != (b[0]));
2717
2718 Diag = _mm_insert_epi16(Diag,2*e,0);
2719 Diag = _mm_insert_epi16(Diag,error,1);
2720 Diag = _mm_insert_epi16(Diag,2*e,2);
2721
2722 Side1 = _mm_insert_epi16(Side1,1,0);
2723 Side1 = _mm_insert_epi16(Side1,1,1);
2724 Side1 = _mm_insert_epi16(Side1,2*e,2);
2725
2726 Side2 = _mm_insert_epi16(Side2,2*e,0);
2727 Side2 = _mm_insert_epi16(Side2,1,1);
2728 Side2 = _mm_insert_epi16(Side2,1,2);
2729
2730 Down1 = _mm_insert_epi16(Down1,1,0);
2731 Down1 = _mm_insert_epi16(Down1,1,1);
2732 Down1 = _mm_insert_epi16(Down1,2*e,2);
2733
2734 Down2 = _mm_insert_epi16(Down2,2*e,0);
2735 Down2 = _mm_insert_epi16(Down2,1,1);
2736 Down2 = _mm_insert_epi16(Down2,1,2);
2737
2738 tmp = _mm_slli_si128(R1,2);
2739
2740 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
2741 R0 = _mm_min_epi16(R0,tmp+Down2);
2742
2743 // printf("%d %d %d\n", _mm_extract_epi16(R0,0), _mm_extract_epi16(R0,1), _mm_extract_epi16(R0,2));
2744 for (i = 3; i < 2*lena; i++)
2745 {
2746 if(i % 2 ==1)
2747 {
2748 Diag = _mm_sub_epi8(Diag, Diag);
2749 error = ( *(a-((i+1)/2-1)) != *(b-((i-1)/2-1)) );
2750 Diag = _mm_insert_epi16(Diag,error,0);
2751 error = ( *(a-((i-1)/2-1)) != *(b-((i+1)/2-1)) );
2752 Diag = _mm_insert_epi16(Diag,error,1);
2753 //printf("#%d #%d\n", _mm_extract_epi16(Diag,0), _mm_extract_epi16(Diag,1));
2754 tmp = _mm_srli_si128(R0,2);
2755
2756 R1 = _mm_min_epi16(tmp+Side1, R1+Diag);
2757 R1 = _mm_min_epi16(R1,R0+Down1);
2758
2759 if(i > 2 * lenb - 2) {
2760 i1 = _mm_extract_epi16(R1, 1);
2761 totalError = min(totalError, i1);
2762 }
2763 // printf("%d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1));
2764 }
2765
2766 else if(i % 2 == 0)
2767 {
2768 error = ( *(a-(i/2)) != *(b-(i/2-2)) );
2769 Diag = _mm_insert_epi16(Diag,error,0);
2770 error = ( *(a-(i/2-1)) != *(b-(i/2-1)) );
2771 Diag = _mm_insert_epi16(Diag,error,1);
2772 error = ( *(a-(i/2-2)) != *(b-(i/2)));
2773 Diag = _mm_insert_epi16(Diag,error,2);
2774
2775 tmp = _mm_slli_si128(R1,2);
2776
2777 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
2778 R0 = _mm_min_epi16(R0,tmp+Down2);
2779
2780 tmp = _mm_sub_epi16(ERROR_REACH, R0);
2781 i0 = _mm_movemask_epi8(tmp);
2782
2783 if(i0 == 63 && _mm_extract_epi16(R1,0) > errThreshold && _mm_extract_epi16(R1,1) > errThreshold && i < 2 * lenb - 2)
2784 return -1;
2785
2786 if(i == 2 * lenb - 2) {
2787 totalError = _mm_extract_epi16(R0, 2);
2788 }
2789 }
2790 }
2791 Down1 = _mm_insert_epi16(Down1,2*e,0);
2792
2793 //fill the first part of the error
2794 error = ( *(a-(i/2)) != *(b-(i/2-2)) );
2795 Diag = _mm_insert_epi16(Diag,error,0);
2796 error = ( *(a-(i/2-1)) != *(b-(i/2-1)) );
2797 Diag = _mm_insert_epi16(Diag,error,1);
2798 Diag = _mm_insert_epi16(Diag,2*e,2);
2799
2800 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
2801 R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1);
2802
2803 i0 = _mm_extract_epi16(R0, 0);
2804 i1 = _mm_extract_epi16(R0, 1);
2805
2806 totalError = min(totalError, i1);
2807
2808 //fill the second part of the error
2809 i++;
2810 Diag = _mm_sub_epi8(Diag, Diag);
2811 Diag = _mm_insert_epi16(Diag,2*e,0);
2812 error = ( *(a-(i/2)) != *(b-(lenb-1)) );
2813 Diag = _mm_insert_epi16(Diag,error,1);
2814 Diag = _mm_insert_epi16(Diag,2*e,2);
2815
2816 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
2817 R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1);
2818
2819 i0 = _mm_extract_epi16(R1, 0);
2820 i1 = _mm_extract_epi16(R1, 1);
2821
2822 totalError = min(totalError, i1);
2823
2824 //fill the last the last element of the matrix
2825 i++;
2826 Diag = _mm_sub_epi8(Diag, Diag);
2827 error = ( *(a-(i/2)) != *(b-(lenb-1)) );
2828 Diag = _mm_insert_epi16(Diag,error,0);
2829
2830 Down = _mm_insert_epi16(Down,1,0);
2831
2832 Side = _mm_insert_epi16(Side,1,0);
2833
2834 tmp = _mm_srli_si128(R1,2);
2835
2836 R0 = _mm_min_epi16(R1+Down, _mm_srli_si128(R0,2)+Diag);
2837 R0 = _mm_min_epi16(R0,tmp+Side);
2838
2839 i0 = _mm_extract_epi16(R0, 0);
2840
2841 totalError = min(totalError, i0);
2842
2843 if(totalError > e || totalError == 0)
2844 return -1;
2845 return totalError;
2846 }
2847
2848 void initBestMapping(int totalReadNumber)
2849 {
2850 int i = 0;
2851 bestHitMappingInfo = getMem(totalReadNumber * sizeof(BestFullMappingInfo));
2852 for(i = 0; i < totalReadNumber; i++) {
2853 bestHitMappingInfo[i].loc = -1;
2854 }
2855 }
2856
2857
2858 void finalizeBestSingleMapping()
2859 {
2860 int i = 0;
2861 char *_tmpQual, *_tmpSeq;
2862 char rqual[SEQ_LENGTH + 1];
2863 rqual[SEQ_LENGTH]='\0';
2864
2865 for(i = 0; i < _msf_seqListSize; i++)
2866 {
2867 if(_msf_seqList[i].hits[0] != 0)
2868 {
2869 if (bestHitMappingInfo[i].dir)
2870 {
2871 reverse(_msf_seqList[i].qual, rqual, SEQ_LENGTH);
2872 _tmpQual = rqual;
2873 _tmpSeq = _msf_seqList[i].rseq;
2874 }
2875 else
2876 {
2877 _tmpQual = _msf_seqList[i].qual;
2878 _tmpSeq = _msf_seqList[i].seq;
2879 }
2880
2881
2882 _msf_output.QNAME = _msf_seqList[i].name;
2883 _msf_output.FLAG = 16 * bestHitMappingInfo[i].dir;
2884 _msf_output.RNAME = bestHitMappingInfo[i].chr;
2885
2886 _msf_output.POS = bestHitMappingInfo[i].loc;
2887 _msf_output.MAPQ = 255;
2888 _msf_output.CIGAR = bestHitMappingInfo[i].cigar ;
2889 _msf_output.MRNAME = "*";
2890 _msf_output.MPOS = 0;
2891 _msf_output.ISIZE = 0;
2892
2893
2894 _msf_output.SEQ = _tmpSeq;
2895 _msf_output.QUAL = _tmpQual;
2896
2897 _msf_output.optSize = 2;
2898 _msf_output.optFields = _msf_optionalFields;
2899
2900 _msf_optionalFields[0].tag = "NM";
2901 _msf_optionalFields[0].type = 'i';
2902 _msf_optionalFields[0].iVal = bestHitMappingInfo[i].err;
2903
2904 _msf_optionalFields[1].tag = "MD";
2905 _msf_optionalFields[1].type = 'Z';
2906 _msf_optionalFields[1].sVal = bestHitMappingInfo[i].md;
2907
2908 output(_msf_output);
2909 }
2910 }
2911 freeMem(bestHitMappingInfo, _msf_seqListSize * sizeof(FullMappingInfo));
2912 }
2913 /**********************************************/
2914 int compare (const void *a, const void *b)
2915 {
2916 return ((Pair *)a)->hv - ((Pair *)b)->hv;
2917 /*char *s1 = ((Pair *)a)->hv;
2918 char *s2 = ((Pair *)b)->hv;
2919 int i = 0;
2920
2921 int diff = 0;
2922 int sign = 0;
2923
2924 for(i = 0; i < SEQ_LENGTH; i++)
2925 {
2926 diff += (s1[i] != s2[i]);
2927 if(s1[i] > s2[i])
2928 sign++;
2929 else if(s1[i] < s2[i])
2930 sign--;
2931 }
2932
2933 return diff*sign;*/
2934 // return strncmp(s1, s2,SEQ_LENGTH);
2935
2936 }
2937 /**********************************************/
2938 void preProcessReads()
2939 {
2940 int i = 0;
2941
2942 _msf_sort_seqList = getMem(_msf_seqListSize * sizeof(Pair));
2943 for(i = 0; i < _msf_seqListSize; i++)
2944 {
2945 _msf_sort_seqList[i].hv = hashVal(_msf_seqList[i].seq);
2946
2947 _msf_sort_seqList[i].readNumber = i;
2948 }
2949
2950 qsort(_msf_sort_seqList, _msf_seqListSize, sizeof(Pair), compare);
2951
2952 /*
2953 for(i = 0; i < _msf_seqListSize; i++)
2954 {
2955 //printf("%s\n", _msf_sort_seqList[i].hv);
2956 }
2957 */
2958
2959 _msf_map_sort_seqList = getMem(_msf_seqListSize * sizeof(int));
2960
2961 for(i = 0; i < _msf_seqListSize; i++)
2962 _msf_map_sort_seqList[_msf_seqList[i].readNumber] = i;
2963
2964 }
2965 /**********************************************/
2966
2967 int verifySingleEnd(int index, char* seq, int offset)
2968 {
2969 int curOff = 0;
2970 int i;
2971
2972 char *ref;
2973
2974 int err;
2975 int errCnt =0;
2976 int errCntOff = 0;
2977 int NCntOff = 0;
2978
2979 ref = _msf_refGen + index - 1;
2980
2981 verificationCnt++;
2982
2983 for (i = 0; i < SEQ_LENGTH; i++)
2984 {
2985 err = *ref != *seq;
2986 errCnt += err;
2987 if (errCnt > errThreshold)
2988 {
2989
2990 return -1;
2991 }
2992
2993 if (i >= _msf_samplingLocs[curOff] && i <= _msf_samplingLocsEnds[curOff])
2994 {
2995 errCntOff += err;
2996 NCntOff += (*seq == 'N');
2997 }
2998 else if (curOff < _msf_samplingLocsSize && i>=_msf_samplingLocs[curOff+1])
2999 {
3000
3001 if (errCntOff == 0 && NCntOff == 0 && offset > curOff)
3002 {
3003 return -1;
3004 }
3005
3006 errCntOff = 0;
3007 NCntOff = 0;
3008 curOff++;
3009
3010 if ( i >= _msf_samplingLocs[curOff])
3011 {
3012 errCntOff += err;
3013 NCntOff += (*seq == 'N');
3014 }
3015 }
3016
3017 ref++;
3018 seq++;
3019 }
3020 return errCnt;
3021 }
3022
3023 /*********************************************/
3024 void initFAST(Read *seqList, int seqListSize, int *samplingLocs, int samplingLocsSize, char *genFileName)
3025 {
3026 int i;
3027
3028 if (_msf_optionalFields == NULL)
3029 {
3030 _msf_op = getMem(SEQ_LENGTH);
3031 if (pairedEndMode)
3032 {
3033 _msf_optionalFields = getMem(8*sizeof(OPT_FIELDS));
3034 }
3035 else
3036 {
3037 _msf_optionalFields = getMem(2*sizeof(OPT_FIELDS));
3038 }
3039
3040 for (i=0; i<200;i++)
3041 {
3042 sprintf(_msf_numbers[i],"%d%c",i, '\0');
3043 }
3044 sprintf(_msf_cigar, "%dM", SEQ_LENGTH);
3045 }
3046
3047 if (_msf_samplingLocsEnds == NULL)
3048 {
3049 _msf_samplingLocs = samplingLocs;
3050 _msf_samplingLocsSize = samplingLocsSize;
3051
3052 _msf_samplingLocsEnds = getMem(sizeof(int)*_msf_samplingLocsSize);
3053 for (i=0; i<_msf_samplingLocsSize; i++)
3054 {
3055 _msf_samplingLocsEnds[i]=_msf_samplingLocs[i]+WINDOW_SIZE-1;
3056 }
3057
3058 _msf_seqList = seqList;
3059 _msf_seqListSize = seqListSize;
3060
3061 preProcessReads();
3062
3063 _msf_oeaMapping = getMem(_msf_seqListSize * sizeof(int));
3064 for(i = 0; i < _msf_seqListSize; i++)
3065 {
3066 _msf_oeaMapping[i] = 0;
3067 }
3068
3069 _msf_discordantMapping = getMem(_msf_seqListSize * sizeof(int));
3070 for(i = 0; i < _msf_seqListSize; i++)
3071 {
3072 _msf_discordantMapping[i] = 0;
3073 }
3074
3075 }
3076
3077 if (_msf_refGenName == NULL)
3078 {
3079 _msf_refGenName = getMem(4*SEQ_LENGTH);
3080 }
3081 _msf_refGen = getRefGenome();
3082 _msf_refGenLength = strlen(_msf_refGen);
3083
3084 _msf_refGenOffset = getRefGenomeOffset();
3085 snprintf(_msf_refGenName, 4*SEQ_LENGTH,"%s%c", getRefGenomeName(), '\0');
3086 _msf_refGenName[strlen(getRefGenomeName())] = '\0';
3087
3088
3089 if (_msf_verifiedLocs != NULL){
3090 freeMem(_msf_verifiedLocs, sizeof(int) * (_msf_refGenLength+1));
3091 }
3092
3093 _msf_verifiedLocs = (int *) getMem(sizeof(int)*(_msf_refGenLength+1));
3094
3095 for (i=0; i<=_msf_refGenLength; i++)
3096 _msf_verifiedLocs[i] = _msf_seqListSize*10+1;
3097
3098
3099
3100 if (pairedEndMode && _msf_seqHits == NULL)
3101 {
3102
3103 _msf_mappingInfo = getMem(seqListSize * sizeof (MappingInfo));
3104
3105 for (i=0; i<seqListSize; i++)
3106 {
3107 //_msf_mappingInfo[i].next = getMem(sizeof(MappingLocations));
3108 _msf_mappingInfo[i].next = NULL;
3109 _msf_mappingInfo[i].size = 0;
3110 }
3111
3112 _msf_seqHits = getMem((_msf_seqListSize) * sizeof(int));
3113
3114
3115 for (i=0; i<_msf_seqListSize; i++)
3116 {
3117 _msf_seqHits[i] = 0;
3118 }
3119
3120 _msf_readHasConcordantMapping = getMem(_msf_seqListSize / 2 * sizeof(char));
3121 for(i = 0; i < _msf_seqListSize/2; i++)
3122 {
3123 _msf_readHasConcordantMapping[i] = 0;
3124 }
3125
3126 initLoadingRefGenome(genFileName);
3127 }
3128
3129 if (_msf_refGenOffset == 0)
3130 {
3131 _msf_refGenBeg = 1;
3132 }
3133 else
3134 {
3135 _msf_refGenBeg = CONTIG_OVERLAP - SEQ_LENGTH + 2;
3136 }
3137 _msf_refGenEnd = _msf_refGenLength - SEQ_LENGTH + 1;
3138
3139
3140 }
3141 /**********************************************/
3142 void finalizeFAST()
3143 {
3144 freeMem(_msf_seqHits, (_msf_seqListSize) * sizeof(int));
3145 freeMem(_msf_refGenName, 4*SEQ_LENGTH);
3146
3147
3148 /*
3149 int i;
3150 for (i=0; i<_msf_rIndexSize; i++)
3151 {
3152 freeMem(_msf_rIndex[i].seqInfo, _msf_rIndex[i].seqInfo[0]+1);
3153 }
3154 freeMem(_msf_rIndex, _msf_rIndexSize);*/
3155
3156
3157 freeMem(_msf_map_sort_seqList, sizeof(Pair) * _msf_seqListSize);
3158 freeMem(_msf_sort_seqList, sizeof(int) * _msf_seqListSize);
3159
3160 }
3161
3162 /*
3163 Will apply the Levenshtein Dynamic programming.
3164 Different from verifySingleEndEditDistance fucntion
3165 as in this fucntion only one dynamic table is made while
3166 in verifySingleEndEditDistance two dynamic table is made
3167 for each right and left string
3168 */
3169 int editDistance(int refIndex, char *seq, int seqLength, char *matrix)
3170 {
3171 int i = 0;
3172 int size = 0;
3173 int error = 0;
3174 int rIndex = 0;
3175 int directionIndex = 0;
3176
3177 int min = 0;
3178 int minIndex =0;
3179
3180 int tempUp = 0;
3181 int tempDown = 0;
3182
3183 char *ref;
3184
3185 int errorString = 0;
3186 /*
3187 1: Up
3188 2: Side
3189 3: Diagnoal Match
3190 4: Diagnoal Mismatch
3191 */
3192
3193 int upValue;
3194 int diagValue;
3195 int sideValue;
3196
3197 ref = _msf_refGen + refIndex - 1;
3198
3199 rIndex = 1;
3200
3201 for(i=0; i <= errThreshold; i++)
3202 {
3203 score[0][i] = i;
3204 score[i][0] = i;
3205 }
3206
3207 while(rIndex <= seqLength +errThreshold)
3208 {
3209 tempUp = ((rIndex - errThreshold) > 0 ? ((rIndex > seqLength) ? seqLength - errThreshold :rIndex - errThreshold) : 1 );
3210 tempDown = ((rIndex >= seqLength-errThreshold ) ? seqLength+1 :rIndex + errThreshold + 1);
3211 for(i = tempUp ; i < tempDown ; i++)
3212 {
3213 errorString = (*(ref+rIndex-1) == *(seq+i-1));
3214
3215 upValue = score[i-1][rIndex]+1;
3216 diagValue = score[i-1][rIndex-1]+ !errorString;
3217 sideValue = score[i][rIndex-1]+1;
3218
3219 if(i != tempUp && i != tempDown-1)
3220 score[i][rIndex] = min3(sideValue, diagValue , upValue);
3221
3222 else if( (i == ((rIndex - errThreshold) > 0 ? rIndex - errThreshold : 1)) && rIndex <= seqLength )
3223 score[i][rIndex] = min(sideValue, diagValue);
3224 else if(rIndex > seqLength && (i == seqLength - errThreshold) )
3225 score[i][rIndex] = sideValue;
3226 else
3227 score[i][rIndex] = min(diagValue , upValue);
3228
3229 if(i == tempUp)
3230 error = score[i][rIndex];
3231 else if(error > score[i][rIndex])
3232 error = score[i][rIndex];
3233 }
3234 rIndex++;
3235 }
3236
3237 min = score[seqLength][seqLength+errThreshold];
3238 minIndex = seqLength + errThreshold;
3239
3240 // Find the Best error for all the possible ways.
3241 for(i = 1; i <= 2*errThreshold; i++)
3242 {
3243 if(min >= score[seqLength][seqLength+errThreshold-i] && seqLength+errThreshold-i > 0)
3244 {
3245 min = score[seqLength][seqLength+errThreshold-i];
3246 minIndex = seqLength+errThreshold-i;
3247 }
3248 }
3249
3250 error = score[seqLength][minIndex];
3251
3252 directionIndex = seqLength;
3253 rIndex = minIndex;
3254 while(directionIndex != 0 || rIndex != 0)
3255 {
3256
3257 if(rIndex == 0)
3258 {
3259 if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1)
3260 {
3261 matrix[size] = *(seq+directionIndex-1);
3262 size++;
3263 matrix[size] = 'I';
3264 directionIndex--;
3265 }
3266 }
3267 else if(directionIndex == 0)
3268 {
3269 if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1)
3270 {
3271 matrix[size] = *(ref+rIndex-1);
3272 size++;
3273 matrix[size] = 'D';
3274 rIndex--;
3275 }
3276 }
3277 else if(directionIndex-rIndex == errThreshold)
3278 {
3279 if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1)
3280 {
3281 matrix[size] = *(seq+directionIndex-1);
3282 size++;
3283 matrix[size] = 'I';
3284 directionIndex--;
3285 }
3286 else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 )
3287 {
3288 matrix[size] = *(ref+rIndex-1);
3289 rIndex--;
3290 directionIndex--;
3291 }
3292 else
3293 {
3294 matrix[size] = 'M';
3295 rIndex--;
3296 directionIndex--;
3297 }
3298
3299 }
3300 else if(rIndex - directionIndex == errThreshold)
3301 {
3302 if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1)
3303 {
3304 matrix[size] = *(ref+rIndex-1);
3305 size++;
3306 matrix[size] = 'D';
3307 rIndex--;
3308 }
3309 else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 )
3310 {
3311 matrix[size] = *(ref+rIndex-1);
3312 rIndex--;
3313 directionIndex--;
3314 }
3315 else
3316 {
3317 matrix[size] = 'M';
3318 rIndex--;
3319 directionIndex--;
3320 }
3321 }
3322 else
3323 {
3324 if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1 && directionIndex != 0)
3325 {
3326 matrix[size] = *(seq+directionIndex-1);
3327 size++;
3328 matrix[size] = 'I';
3329 directionIndex--;
3330 }
3331 else if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1 && rIndex != 0)
3332 {
3333 matrix[size] = *(ref+rIndex-1);
3334 size++;
3335 matrix[size] = 'D';
3336 rIndex--;
3337 }
3338 else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 )
3339 {
3340 matrix[size] = *(ref+rIndex-1);
3341 rIndex--;
3342 directionIndex--;
3343 }
3344 else
3345 {
3346 matrix[size] = 'M';
3347 rIndex--;
3348 directionIndex--;
3349 }
3350 }
3351 size++;
3352 }
3353
3354 matrix[size] = '\0';
3355
3356 char returnString[200];
3357
3358 returnString[0] = '\0';
3359 reverse(matrix, returnString, size);
3360 sprintf(matrix, "%s", returnString);
3361
3362 return error;
3363 }
3364
3365 /*
3366 Applies the Levenshtein dynamic programming
3367 in both the right and the left direction until either the
3368 error threshold is reached or the end of the string is reached.
3369
3370 */
/*
 * Packs the first six characters of seq into an integer, two bits per
 * character (A=0, C=1, G=2, T=3), first character in the most significant
 * position.
 *
 * Returns the packed value (0..4095), or -1 as soon as a character other
 * than upper-case A, C, G or T is met.
 */
int msfHashVal(char *seq)
{
	int code = 0;
	int pos;

	for (pos = 0; pos < 6; pos++)
	{
		char base = seq[pos];
		int digit;

		if (base == 'A')
			digit = 0;
		else if (base == 'C')
			digit = 1;
		else if (base == 'G')
			digit = 2;
		else if (base == 'T')
			digit = 3;
		else
			return -1;

		code = (code << 2) | digit;
	}

	return code;
}
3401
3402
3403
3404 int verifySingleEndEditDistance2(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue)
3405 {
3406 int i = 0;
3407
3408 char * ref;
3409 char * tempref;
3410
3411 int rIndex = 0; //reference Index
3412
3413 int e = errThreshold;
3414 int error = 0;
3415 int error1 = 0;
3416 int totalError = 0;
3417
3418
3419 /*
3420 1: Up
3421 2: Side
3422 3: Diagnoal Match
3423 4: Diagnoal Mismatch
3424 */
3425
3426
3427 int minIndex1 = 0;
3428 int minIndex2 = 0;
3429
3430
3431 int directionIndex = 0;
3432
3433 int size = 0;
3434
3435 int startIndex1 = 0;
3436
3437 rIndex = 1;
3438
3439
3440 char matrixR[200];
3441 char matrixL[200];
3442
3443 ref = _msf_refGen + refIndex - 1;
3444 tempref = _msf_refGen + refIndex - 1;
3445
3446 int jumpIndex = 0;
3447
3448 if(rSeqLength != 0)
3449 {
3450 error1 = forwardEditDistance2SSE2(ref+segLength+jumpIndex, rSeqLength-jumpIndex, rSeq+jumpIndex, rSeqLength-jumpIndex);
3451 if(error1 == -1)
3452 return -1;
3453 }
3454
3455
3456 if(lSeqLength != 0)
3457 {
3458 error = backwardEditDistance2SSE2(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength);
3459 if(error == -1)
3460 {
3461 return -1;
3462 }
3463 }
3464
3465 matrixL[0] = '\0';
3466 matrixR[0] = '\0';
3467
3468
3469 ref = _msf_refGen + refIndex - 1;
3470
3471 if(error1+error > errThreshold)
3472 return -1;
3473
3474 ref = _msf_refGen + refIndex - 1;
3475
3476 rIndex = startIndex1+1;
3477
3478 int i0 = 0;
3479 int i1 = 0;
3480 int i2 = 0;
3481
3482 __m128i R0;
3483 __m128i R1;
3484
3485 __m128i Side1, Side2,Side; //side matrix
3486 __m128i Down1, Down2,Down; //down matrix
3487 __m128i Diag; //
3488
3489 __m128i tmp;
3490
3491 /* initialize */
3492 R0 = _mm_setzero_si128 ();
3493 R1 = _mm_setzero_si128 ();
3494 Diag = _mm_setzero_si128 ();
3495 Side1 = _mm_setzero_si128 ();
3496 Side2 = _mm_setzero_si128 ();
3497 Down1 = _mm_setzero_si128 ();
3498 Down2 = _mm_setzero_si128 ();
3499 Down = _mm_setzero_si128 ();
3500 Side = _mm_setzero_si128 ();
3501 tmp = _mm_setzero_si128 ();
3502 /* end initialize */
3503
3504 int mismatch[3] = {0,0,0};
3505
3506 if(lSeqLength != 0)
3507 {
3508 char *a;
3509 char *b;
3510
3511 a = ref-1;
3512 b = lSeq+lSeqLength-1;
3513
3514 R0 = _mm_insert_epi16(R0,0,0);
3515
3516 score[0][0] = 0;
3517
3518 R1 = _mm_insert_epi16(R1,1,0);
3519 R1 = _mm_insert_epi16(R1,1,1);
3520
3521 score[1][0] = 1;
3522 direction1[1][0] = 1;
3523 score[0][1] = 1;
3524 direction1[0][1] = 2;
3525
3526 mismatch[0] = ((a[0]) != (b[0]));
3527
3528 Diag = _mm_insert_epi16(Diag,2*e,0);
3529 Diag = _mm_insert_epi16(Diag,mismatch[0],1);
3530 Diag = _mm_insert_epi16(Diag,2*e,2);
3531
3532 Side1 = _mm_insert_epi16(Side1,1,0);
3533 Side1 = _mm_insert_epi16(Side1,1,1);
3534 Side1 = _mm_insert_epi16(Side1,2*e,2);
3535
3536 Side2 = _mm_insert_epi16(Side2,2*e,0);
3537 Side2 = _mm_insert_epi16(Side2,1,1);
3538 Side2 = _mm_insert_epi16(Side2,1,2);
3539
3540 Down1 = _mm_insert_epi16(Down1,1,0);
3541 Down1 = _mm_insert_epi16(Down1,1,1);
3542 Down1 = _mm_insert_epi16(Down1,2*e,2);
3543
3544 Down2 = _mm_insert_epi16(Down2,2*e,0);
3545 Down2 = _mm_insert_epi16(Down2,1,1);
3546 Down2 = _mm_insert_epi16(Down2,1,2);
3547
3548 tmp = _mm_slli_si128(R1,2);
3549
3550 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
3551 R0 = _mm_min_epi16(R0,tmp+Down2);
3552
3553 i0 = _mm_extract_epi16(R0, 0);
3554 i1 = _mm_extract_epi16(R0, 1);
3555 i2 = _mm_extract_epi16(R0, 2);
3556
3557 score[0][2] = i0;
3558 score[1][1] = i1;
3559 score[2][0] = i2;
3560
3561 direction1[0][2] = 2;
3562 direction1[1][1] = ((mismatch[0] == 0)? 3 : 4);
3563 direction1[2][0] = 1;
3564
3565 for (i = 3; i < 2*lSeqLength; i++)
3566 {
3567 if(i % 2 ==1)
3568 {
3569 Diag = _mm_sub_epi8(Diag, Diag);
3570 mismatch[0] = ( *(a-((i+1)/2-1)) != *(b-((i-1)/2-1)) );
3571 Diag = _mm_insert_epi16(Diag,mismatch[0],0);
3572 mismatch[1] = ( *(a-((i-1)/2-1)) != *(b-((i+1)/2-1)) );
3573 Diag = _mm_insert_epi16(Diag,mismatch[1],1);
3574
3575 tmp = _mm_srli_si128(R0,2);
3576
3577 R1 = _mm_min_epi16(tmp+Side1, R1+Diag);
3578 R1 = _mm_min_epi16(R1,R0+Down1);
3579
3580 i0 = _mm_extract_epi16(R1, 0);
3581 i1 = _mm_extract_epi16(R1, 1);
3582
3583 score[i/2][i/2+1] = i0;
3584 score[i/2+1][i/2] = i1;
3585
3586 direction1[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 :
3587 (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 :
3588 (score[i/2][i/2+1]-score[i/2][i/2]==1) ? 2 : 4;
3589
3590 direction1[i/2+1][i/2] = (score[i/2+1][i/2]==score[i/2][i/2-1] && mismatch[1] == 0) ? 3 :
3591 (score[i/2+1][i/2]-score[i/2][i/2]==1) ? 1 :
3592 (score[i/2+1][i/2]-score[i/2+1][i/2-1]==1)? 2 : 4;
3593
3594 if(i > 2 * lSeqLength - 2)
3595 {
3596 error = min(error, i1);
3597 if(error == i1)
3598 minIndex1 = i-lSeqLength;
3599 }
3600 }
3601
3602 else if(i % 2 == 0)
3603 {
3604 mismatch[0] = ( *(a-(i/2)) != *(b-(i/2-2)) );
3605 Diag = _mm_insert_epi16(Diag,mismatch[0],0);
3606 mismatch[1] = ( *(a-(i/2-1)) != *(b-(i/2-1)) );
3607 Diag = _mm_insert_epi16(Diag,mismatch[1],1);
3608 mismatch[2] = ( *(a-(i/2-2)) != *(b-(i/2)) );
3609 Diag = _mm_insert_epi16(Diag,mismatch[2],2);
3610
3611 tmp = _mm_slli_si128(R1,2);
3612
3613 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
3614 R0 = _mm_min_epi16(R0,tmp+Down2);
3615
3616 i0 = _mm_extract_epi16(R0, 0);
3617 i1 = _mm_extract_epi16(R0, 1);
3618 i2 = _mm_extract_epi16(R0, 2);
3619
3620 score[i/2-1][i/2+1] = i0;
3621 score[i/2][i/2] = i1;
3622 score[i/2+1][i/2-1] = i2;
3623
3624 direction1[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4;
3625
3626 direction1[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 :
3627 (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 :
3628 (score[i/2][i/2]-score[i/2][i/2-1]==1) ? 2 : 4;
3629
3630 direction1[i/2+1][i/2-1] = (score[i/2+1][i/2-1]==score[i/2][i/2-2] && mismatch[2] == 0) ? 3 :
3631 (score[i/2+1][i/2-1]-score[i/2][i/2-1]==1) ? 1 : 4;
3632
3633 if( (i/2) % segLength == 0 && i1 == 0) // the segment has been processed no need to process it again
3634 {
3635 return -1;
3636 }
3637
3638 if(i == 2 * lSeqLength - 2)
3639 {
3640 error = i2;
3641 minIndex1 = i-lSeqLength;
3642 }
3643 }
3644 }
3645
3646 Down1 = _mm_insert_epi16(Down1,2*e,0);
3647
3648 //fill the first part of the error
3649 mismatch[0] = ( *(a-(i/2)) != *(b-(i/2-2)) );
3650 Diag = _mm_insert_epi16(Diag,mismatch[0],0);
3651 mismatch[1] = ( *(a-(i/2-1)) !=*(b-(i/2-1)) );
3652 Diag = _mm_insert_epi16(Diag,mismatch[1],1);
3653 Diag = _mm_insert_epi16(Diag,2*e,2);
3654
3655 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
3656 R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1);
3657
3658 i0 = _mm_extract_epi16(R0, 0);
3659 i1 = _mm_extract_epi16(R0, 1);
3660
3661 error = min(error, i1);
3662 if(error == i1)
3663 minIndex1 = i-lSeqLength;
3664
3665 score[i/2-1][i/2+1] = i0;
3666 score[i/2][i/2] = i1;
3667
3668 direction1[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 :
3669 (score[i/2-1][i/2+1]-score[i/2-1][i/2]) ? 2 : 4;
3670
3671 direction1[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 :
3672 (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 :
3673 (score[i/2][i/2]-score[i/2][i/2-1]==1)? 2 : 4;
3674
3675 //fill the second part of the error
3676 i++;
3677 Diag = _mm_sub_epi8(Diag, Diag);
3678 Diag = _mm_insert_epi16(Diag,2*e,0);
3679 mismatch[0] = ( *(a-(i/2)) != *(b-(lSeqLength-1)) );
3680 Diag = _mm_insert_epi16(Diag,mismatch[0],1);
3681 Diag = _mm_insert_epi16(Diag,2*e,2);
3682
3683 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
3684 R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1);
3685
3686 i0 = _mm_extract_epi16(R1, 0);
3687 i1 = _mm_extract_epi16(R1, 1);
3688
3689 error = min(error, i1);
3690 if(error == i1)
3691 minIndex1 = i-lSeqLength;
3692
3693 score[i/2-1][i/2+2] = i0;
3694 score[i/2][i/2+1] = i1;
3695
3696 direction1[i/2-1][i/2+2] = (score[i/2-1][i/2+2]==score[i/2-2][i/2+1] && mismatch[0] == 0) ? 3 :
3697 (score[i/2-1][i/2+2]-score[i/2-1][i/2+1]==1) ? 2 : 4;
3698
3699 direction1[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2]) ? 3 :
3700 (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 :
3701 (score[i/2][i/2+1]-score[i/2][i/2]==1)? 2 : 4;
3702
3703 //fill the last the last element of the matrix
3704 i++;
3705 Diag = _mm_sub_epi8(Diag, Diag);
3706 mismatch[0] = ( *(a-(i/2)) != *(b-(lSeqLength-1)) );
3707 Diag = _mm_insert_epi16(Diag,mismatch[0],0);
3708
3709 Down = _mm_insert_epi16(Down,1,0);
3710
3711 Side = _mm_insert_epi16(Side,1,0);
3712
3713 tmp = _mm_srli_si128(R1,2);
3714
3715 R0 = _mm_min_epi16(R1+Down, R0+Diag);
3716 R0 = _mm_min_epi16(R0,tmp+Side);
3717
3718 i0 = _mm_extract_epi16(R0, 0);
3719
3720 error = min(error, i0);
3721 if(error == 0)
3722 return -1;
3723 if(error == i0)
3724 minIndex1 = i-lSeqLength;
3725 if(mismatch[0] == 0)
3726 direction1[lSeqLength][lSeqLength+errThreshold] = 3;
3727 else
3728 {
3729 if(score[lSeqLength][lSeqLength+errThreshold] - score[lSeqLength][lSeqLength+errThreshold-1] == 1)
3730 direction1[lSeqLength][lSeqLength+errThreshold] = 2;
3731 else if(score[lSeqLength][lSeqLength+errThreshold] - score[lSeqLength-1][lSeqLength+errThreshold] == 1)
3732 direction1[lSeqLength][lSeqLength+errThreshold] = 1;
3733 else
3734 direction1[lSeqLength][lSeqLength+errThreshold] = 4;
3735 }
3736 }
3737 error1 = error;
3738 error = 0;
3739
3740 directionIndex = lSeqLength;
3741 rIndex = minIndex1;
3742
3743
3744 *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ;
3745
3746 ref = ref + segLength;
3747
3748 if(rSeqLength <= e)
3749 {
3750 char *a;
3751 char *b;
3752
3753 int tmp_index = 0;
3754
3755 a = ref;
3756 b = rSeq;
3757
3758 for(tmp_index = 0; tmp_index < rSeqLength; tmp_index++)
3759 {
3760 matrixR[tmp_index] = (a[tmp_index]==b[tmp_index]) ? 'M' : a[tmp_index] ;
3761 }
3762 matrixR[tmp_index] = '\0';
3763 }
3764 else if(rSeqLength != 0 && rSeqLength >= e)
3765 {
3766 char *a;
3767 char *b;
3768
3769 a = ref;
3770 b = rSeq;
3771
3772 R0 = _mm_sub_epi8(R0, R0);
3773 R1 = _mm_sub_epi8(R1, R1);
3774
3775 R0 = _mm_insert_epi16(R0,0,0);
3776
3777 score[0][0] = 0;
3778
3779 R1 = _mm_insert_epi16(R1,1,0);
3780 R1 = _mm_insert_epi16(R1,1,1);
3781
3782 score[1][0] = 1;
3783 direction2[1][0] = 1;
3784 score[0][1] = 1;
3785 direction2[0][1] = 2;
3786
3787 mismatch[0] = ((a[0]) != (b[0]));
3788
3789 Diag = _mm_insert_epi16(Diag,2*e,0);
3790 Diag = _mm_insert_epi16(Diag,mismatch[0],1);
3791 Diag = _mm_insert_epi16(Diag,2*e,2);
3792
3793 Side1 = _mm_insert_epi16(Side1,1,0);
3794 Side1 = _mm_insert_epi16(Side1,1,1);
3795 Side1 = _mm_insert_epi16(Side1,2*e,2);
3796
3797 Side2 = _mm_insert_epi16(Side2,2*e,0);
3798 Side2 = _mm_insert_epi16(Side2,1,1);
3799 Side2 = _mm_insert_epi16(Side2,1,2);
3800
3801 Down1 = _mm_insert_epi16(Down1,1,0);
3802 Down1 = _mm_insert_epi16(Down1,1,1);
3803 Down1 = _mm_insert_epi16(Down1,2*e,2);
3804
3805 Down2 = _mm_insert_epi16(Down2,2*e,0);
3806 Down2 = _mm_insert_epi16(Down2,1,1);
3807 Down2 = _mm_insert_epi16(Down2,1,2);
3808
3809 tmp = _mm_slli_si128(R1,2);
3810
3811 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
3812 R0 = _mm_min_epi16(R0,tmp+Down2);
3813
3814 i0 = _mm_extract_epi16(R0, 0);
3815 i1 = _mm_extract_epi16(R0, 1);
3816 i2 = _mm_extract_epi16(R0, 2);
3817
3818 score[0][2] = i0;
3819 score[1][1] = i1;
3820 score[2][0] = i2;
3821
3822 direction2[0][2] = 2;
3823 direction2[1][1] = ((mismatch[0] == 0)? 3 : 4);
3824 direction2[2][0] = 1;
3825
3826
3827 for (i = 3; i < 2*rSeqLength; i++)
3828 {
3829 if(i % 2 ==1)
3830 {
3831 Diag = _mm_sub_epi8(Diag, Diag);
3832 mismatch[0] = ((a[(i+1)/2-1]) != (b[(i-1)/2-1]));
3833 Diag = _mm_insert_epi16(Diag,mismatch[0],0);
3834 mismatch[1] = ((a[(i-1)/2-1]) != (b[(i+1)/2-1]));
3835 Diag = _mm_insert_epi16(Diag,mismatch[1],1);
3836
3837 tmp = _mm_srli_si128(R0,2);
3838
3839 R1 = _mm_min_epi16(tmp+Side1, R1+Diag);
3840 R1 = _mm_min_epi16(R1,R0+Down1);
3841
3842 i0 = _mm_extract_epi16(R1, 0);
3843 i1 = _mm_extract_epi16(R1, 1);
3844
3845 score[i/2][i/2+1] = i0;
3846 score[i/2+1][i/2] = i1;
3847
3848 direction2[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 :
3849 (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 :
3850 (score[i/2][i/2+1]-score[i/2][i/2]==1) ? 2 : 4;
3851
3852 direction2[i/2+1][i/2] = (score[i/2+1][i/2]==score[i/2][i/2-1] && mismatch[1] == 0) ? 3 :
3853 (score[i/2+1][i/2]-score[i/2][i/2]==1) ? 1 :
3854 (score[i/2+1][i/2]-score[i/2+1][i/2-1]==1)? 2 : 4;
3855
3856
3857 if(i > 2 * rSeqLength - 2)
3858 {
3859 error = min(error, i1);
3860 if(error == i1)
3861 minIndex2 = i-rSeqLength;
3862 }
3863 }
3864
3865 else if(i % 2 == 0)
3866 {
3867 mismatch[0] = ((a[i/2]) != (b[i/2-2]));
3868 Diag = _mm_insert_epi16(Diag,mismatch[0],0);
3869 mismatch[1] = ((a[i/2-1]) != (b[i/2-1]));
3870 Diag = _mm_insert_epi16(Diag,mismatch[1],1);
3871 mismatch[2] = ((a[i/2-2]) != (b[i/2]));
3872 Diag = _mm_insert_epi16(Diag,mismatch[2],2);
3873
3874 tmp = _mm_slli_si128(R1,2);
3875
3876 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
3877 R0 = _mm_min_epi16(R0,tmp+Down2);
3878
3879 i0 = _mm_extract_epi16(R0, 0);
3880 i1 = _mm_extract_epi16(R0, 1);
3881 i2 = _mm_extract_epi16(R0, 2);
3882
3883 score[i/2-1][i/2+1] = i0;
3884 score[i/2][i/2] = i1;
3885 score[i/2+1][i/2-1] = i2;
3886
3887 direction2[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 :
3888 (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4;
3889
3890 direction2[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 :
3891 (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 :
3892 (score[i/2][i/2]-score[i/2][i/2-1]==1) ? 2 : 4;
3893
3894 direction2[i/2+1][i/2-1] = (score[i/2+1][i/2-1]==score[i/2][i/2-2] && mismatch[2]==0) ? 3 :
3895 (score[i/2+1][i/2-1]-score[i/2][i/2-1]==1) ? 1 : 4;
3896
3897
3898 if(i == 2 * rSeqLength - 2)
3899 {
3900 error = i2;
3901 minIndex2 = i-rSeqLength;
3902 }
3903 }
3904 }
3905
3906 Down1 = _mm_insert_epi16(Down1,2*e,0);
3907
3908 //fill the first part of the error
3909 mismatch[0] = ((a[i/2]) != (b[i/2-2]));
3910 Diag = _mm_insert_epi16(Diag,mismatch[0],0);
3911 mismatch[1] = ((a[i/2-1]) != (b[i/2-1]));
3912 Diag = _mm_insert_epi16(Diag,mismatch[1],1);
3913 Diag = _mm_insert_epi16(Diag,2*e,2);
3914
3915 R0 = _mm_min_epi16(R1+Side1, R0+Diag);
3916 R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1);
3917
3918 i0 = _mm_extract_epi16(R0, 0);
3919 i1 = _mm_extract_epi16(R0, 1);
3920
3921 error = min(error, i1);
3922 if(error == i1)
3923 minIndex2 = i-rSeqLength;
3924
3925 score[i/2-1][i/2+1] = i0;
3926 score[i/2][i/2] = i1;
3927
3928 direction2[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 :
3929 (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4;
3930
3931 direction2[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 :
3932 (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 :
3933 (score[i/2][i/2]-score[i/2][i/2-1]==1)? 2 : 4;
3934
3935
3936 //fill the second part of the error
3937 i++;
3938 Diag = _mm_sub_epi8(Diag, Diag);
3939 Diag = _mm_insert_epi16(Diag,2*e,0);
3940 mismatch[0] = ((a[i/2]) != (b[rSeqLength-1]));
3941 Diag = _mm_insert_epi16(Diag,mismatch[0],1);
3942 Diag = _mm_insert_epi16(Diag,2*e,2);
3943
3944 R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag);
3945 R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1);
3946
3947 i0 = _mm_extract_epi16(R1, 0);
3948 i1 = _mm_extract_epi16(R1, 1);
3949
3950 error = min(error, i1);
3951 if(error == i1)
3952 minIndex2 = i-rSeqLength;
3953
3954 score[i/2-1][i/2+2] = i0;
3955 score[i/2][i/2+1] = i1;
3956
3957 direction2[i/2-1][i/2+2] = (score[i/2-1][i/2+2]==score[i/2-2][i/2+1] && mismatch[0] == 0) ? 3 :
3958 (score[i/2-1][i/2+2]-score[i/2-1][i/2+1]==1) ? 2 : 3;
3959
3960 direction2[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 :
3961 (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 :
3962 (score[i/2][i/2+1]-score[i/2][i/2]==1)? 2 : 4;
3963
3964
3965 //fill the last the last element of the matrix
3966 i++;
3967 Diag = _mm_sub_epi8(Diag, Diag);
3968 mismatch[0] = ((a[i/2]) != (b[rSeqLength-1]));
3969 Diag = _mm_insert_epi16(Diag,mismatch[0],0);
3970
3971 Down = _mm_sub_epi8(Down, Down);
3972 Down = _mm_insert_epi16(Down,1,0);
3973
3974 Side = _mm_sub_epi8(Side, Side);
3975 Side = _mm_insert_epi16(Side,1,0);
3976
3977 tmp = _mm_srli_si128(R1,2);
3978
3979 R0 = _mm_min_epi16(R1+Down, R0+Diag);
3980 R0 = _mm_min_epi16(R0,tmp+Side);
3981
3982 i0 = _mm_extract_epi16(R0, 0);
3983
3984 error = min(error, i0);
3985 if(error == i0)
3986 minIndex2 = i-rSeqLength;
3987
3988 if(mismatch[0] == 0)
3989 direction2[rSeqLength][rSeqLength+errThreshold] = 3;
3990 else
3991 {
3992 if(score[rSeqLength][rSeqLength+errThreshold] - score[rSeqLength][rSeqLength+errThreshold-1] == 1)
3993 direction2[lSeqLength][lSeqLength+errThreshold] = 2;
3994 else if(score[rSeqLength][rSeqLength+errThreshold] - score[rSeqLength-1][rSeqLength+errThreshold] == 1)
3995 direction2[rSeqLength][rSeqLength+errThreshold] = 1;
3996 else
3997 direction2[rSeqLength][rSeqLength+errThreshold] = 4;
3998 }
3999
4000 }
4001
4002 totalError = error1 + error;
4003
4004 size = 0;
4005 directionIndex = rSeqLength;
4006 rIndex = minIndex2;
4007
4008
4009 if(rSeqLength > e)
4010 {
4011 while(directionIndex != 0 || rIndex != 0)
4012 {
4013
4014 if(direction2[directionIndex][rIndex] == 3)
4015 {
4016 matrixR[size] = 'M';
4017 rIndex--;
4018 directionIndex--;
4019 }
4020 else if(direction2[directionIndex][rIndex] == 4)
4021 {
4022 matrixR[size] = *(ref+rIndex-1);
4023 rIndex--;
4024 directionIndex--;
4025 }
4026 else if(direction2[directionIndex][rIndex] == 2)
4027 {
4028 matrixR[size] = *(ref+rIndex-1);
4029 size++;
4030 matrixR[size] = 'D';
4031 rIndex--;
4032 }
4033 else
4034 {
4035 matrixR[size] = *(rSeq+directionIndex-1);
4036 size++;
4037 matrixR[size] = 'I';
4038 directionIndex--;
4039 }
4040 size++;
4041 }
4042 matrixR[size] = '\0';
4043 }
4044 size = 0;
4045 directionIndex = lSeqLength;
4046 rIndex = minIndex1;
4047
4048 while(directionIndex != 0 || rIndex != 0)
4049 {
4050
4051 if(direction1[directionIndex][rIndex] == 3)
4052 {
4053 matrixL[size] = 'M';
4054 rIndex--;
4055 directionIndex--;
4056 }
4057 else if(direction1[directionIndex][rIndex] == 4)
4058 {
4059 matrixL[size] = *(tempref-rIndex);
4060 rIndex--;
4061 directionIndex--;
4062 }
4063 else if(direction1[directionIndex][rIndex] == 2)
4064 {
4065 matrixL[size] = 'D';
4066 size++;
4067 matrixL[size] = *(tempref-rIndex);
4068 rIndex--;
4069 }
4070 else
4071 {
4072 matrixL[size] = 'I';
4073 size++;
4074 matrixL[size] = *(lSeq+lSeqLength-directionIndex);
4075 directionIndex--;
4076 }
4077
4078 size++;
4079 }
4080
4081 matrixL[size] = '\0';
4082
4083 char middle[200];
4084 middle[0] = '\0';
4085
4086 for(i = 0; i < segLength; i++)
4087 middle[i] = 'M';
4088 middle[segLength] = '\0';
4089
4090 char rmatrixR[200];
4091
4092 reverse(matrixR, rmatrixR, strlen(matrixR));
4093
4094 sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR);
4095
4096 return totalError;
4097 }
4098
4099 int verifySingleEndEditDistance4(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue)
4100 {
4101
4102 int i = 0;
4103
4104 char * ref;
4105 char * tempref;
4106
4107 int rIndex = 0; //reference Index
4108
4109 int error = 0;
4110 int error1 = 0;
4111
4112 int error2 = 0;
4113 int error3 = 0;
4114 int totalError = 0;
4115 int errorSegment = 0;
4116
4117 int ERROR_BOUND = errThreshold;
4118
4119
4120 /*
4121 1: Up
4122 2: Side
4123 3: Diagnoal Match
4124 4: Diagnoal Mismatch
4125 */
4126
4127 int min = 0;
4128 int minIndex1 = 0;
4129 int minIndex2 = 0;
4130
4131 int directionIndex = 0;
4132
4133
4134 int size = 0;
4135
4136 ref = _msf_refGen + refIndex - 1;
4137 tempref = _msf_refGen + refIndex - 1;
4138
4139
4140 if(lSeqLength != 0)
4141 {
4142 error3 = backwardEditDistance4SSE2(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength);
4143 if(error3 == -1 || error3 == 0){
4144 return -1;
4145 }
4146 }
4147
4148 if(rSeqLength != 0)
4149 {
4150 error2 = forwardEditDistance4SSE2(ref+segLength, rSeqLength, rSeq, rSeqLength);
4151 if(error2 == -1)
4152 return -1;
4153 }
4154
4155 if(error2 + error3 > errThreshold)
4156 return -1;
4157
4158 rIndex = 1;
4159
4160 int prevError = 0;
4161
4162 int tempUp = 0;
4163 int tempDown = 0;
4164
4165 int errorString = 0;
4166
4167 int upValue;
4168 int diagValue;
4169 int sideValue;
4170
4171 while(rIndex <= lSeqLength+errThreshold && lSeqLength != 0)
4172 {
4173 tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 );
4174 tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1);
4175 for(i = tempUp ; i < tempDown ; i++)
4176 {
4177 errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i));
4178
4179 upValue = scoreB[i-1][rIndex]+1;
4180 diagValue = scoreB[i-1][rIndex-1]+ !errorString;
4181 sideValue = scoreB[i][rIndex-1]+1;
4182
4183 if(i != tempUp && i != tempDown-1)
4184 scoreB[i][rIndex] = min3(sideValue, diagValue , upValue);
4185
4186 else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength )
4187 scoreB[i][rIndex] = min(sideValue, diagValue);
4188 else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) )
4189 scoreB[i][rIndex] = sideValue;
4190 else
4191 scoreB[i][rIndex] = min(diagValue , upValue);
4192
4193 if(i == tempUp)
4194 error = scoreB[i][rIndex];
4195 else if(error > scoreB[i][rIndex])
4196 error = scoreB[i][rIndex];
4197 }
4198 if(rIndex <= lSeqLength)
4199 {
4200 errorSegment = error-prevError;
4201 }
4202 rIndex++;
4203 }
4204
4205 if(lSeqLength != 0)
4206 {
4207 min = scoreB[lSeqLength][lSeqLength+errThreshold];
4208 minIndex1 = lSeqLength + errThreshold;
4209
4210 // Find the Best error for all the possible ways.
4211 for(i = 1; i <= 2*errThreshold; i++)
4212 {
4213 if(min >= scoreB[lSeqLength][lSeqLength+errThreshold-i] && lSeqLength+errThreshold-i > 0)
4214 {
4215 min = scoreB[lSeqLength][lSeqLength+errThreshold-i];
4216 minIndex1 = lSeqLength+errThreshold-i;
4217 }
4218 }
4219 error = scoreB[lSeqLength][minIndex1];
4220 }
4221
4222 error1 = error;
4223
4224 error = 0;
4225 errorSegment = 0;
4226
4227 directionIndex = lSeqLength;
4228 rIndex = minIndex1;
4229
4230
4231 *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ;
4232
4233 ref = ref + segLength;
4234
4235 if(rSeqLength != 0)
4236 {
4237 rIndex = 1;
4238 while(rIndex <= rSeqLength+errThreshold-error1)
4239 {
4240 tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1;
4241 tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1);
4242 for(i = tempUp; i < tempDown ; i++)
4243 {
4244 errorString = (*(ref+rIndex-1) == *(rSeq+i-1));
4245
4246 upValue = scoreF[i-1][rIndex]+1;
4247 diagValue = scoreF[i-1][rIndex-1]+ !errorString;
4248 sideValue = scoreF[i][rIndex-1]+1;
4249
4250 if(i != tempUp && i != tempDown-1)
4251 scoreF[i][rIndex] = min3(sideValue, diagValue , upValue);
4252 else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength )
4253 scoreF[i][rIndex] = min(sideValue, diagValue);
4254 else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) )
4255 scoreF[i][rIndex] = sideValue;
4256 else
4257 scoreF[i][rIndex] = min(diagValue , upValue);
4258
4259 if(i == tempUp)
4260 error = scoreF[i][rIndex];
4261 if(error > scoreF[i][rIndex])
4262 error = scoreF[i][rIndex];
4263 }
4264 if(rIndex <= rSeqLength)
4265 {
4266 errorSegment = error;
4267 }
4268
4269 rIndex++;
4270 }
4271
4272 min = scoreF[rSeqLength][rSeqLength+errThreshold-error1];
4273 minIndex2 = rSeqLength + errThreshold-error1;
4274
4275 // Find the Best error for all the possible ways.
4276 for(i = 1; i <= 2*(errThreshold-error1); i++)
4277 {
4278 if(min > scoreF[rSeqLength][rSeqLength+errThreshold-error1-i] && rSeqLength+errThreshold-error1-i > 0)
4279 {
4280 min = scoreF[rSeqLength][rSeqLength+errThreshold-error1-i];
4281 minIndex2 = rSeqLength+errThreshold-error1-i;
4282 }
4283 }
4284 error = scoreF[rSeqLength][minIndex2];
4285 }
4286
4287 totalError = error + error1;
4288
4289 if(errThreshold > 4)
4290 printf("ERROR in errorThreshold.\n");
4291
4292
4293 if(totalError != error2 + error3 && totalError > errThreshold)
4294 {
4295 printf("ErrorF=%d, ErrorB=%d Error=%d Error=%d\n", error2,error3,error1,error);
4296
4297 scanf("%d", &i);
4298 }
4299
4300 char matrixR[200];
4301 char matrixL[200];
4302
4303 matrixR[0] = '\0';
4304 matrixL[0] = '\0';
4305
4306 size = 0;
4307 directionIndex = rSeqLength;
4308 rIndex = minIndex2;
4309
4310 while(directionIndex != 0 || rIndex != 0)
4311 {
4312 if(directionIndex-rIndex == errThreshold)
4313 {
4314 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1)
4315 {
4316 matrixR[size] = *(rSeq+directionIndex-1);
4317 size++;
4318 matrixR[size] = 'I';
4319 directionIndex--;
4320 }
4321 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
4322 {
4323 matrixR[size] = *(ref+rIndex-1);
4324 rIndex--;
4325 directionIndex--;
4326 }
4327 else
4328 {
4329 matrixR[size] = 'M';
4330 rIndex--;
4331 directionIndex--;
4332 }
4333
4334 }
4335 else if(rIndex - directionIndex == errThreshold)
4336 {
4337 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1)
4338 {
4339 matrixR[size] = *(ref+rIndex-1);
4340 size++;
4341 matrixR[size] = 'D';
4342 rIndex--;
4343 }
4344 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
4345 {
4346 matrixR[size] = *(ref+rIndex-1);
4347 rIndex--;
4348 directionIndex--;
4349 }
4350 else
4351 {
4352 matrixR[size] = 'M';
4353 rIndex--;
4354 directionIndex--;
4355 }
4356 }
4357 else
4358 {
4359 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0)
4360 {
4361 matrixR[size] = *(rSeq+directionIndex-1);
4362 size++;
4363 matrixR[size] = 'I';
4364 directionIndex--;
4365 }
4366 else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0)
4367 {
4368 matrixR[size] = *(ref+rIndex-1);
4369 size++;
4370 matrixR[size] = 'D';
4371 rIndex--;
4372 }
4373 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
4374 {
4375 matrixR[size] = *(ref+rIndex-1);
4376 rIndex--;
4377 directionIndex--;
4378 }
4379 else
4380 {
4381 matrixR[size] = 'M';
4382 rIndex--;
4383 directionIndex--;
4384 }
4385 }
4386 size++;
4387 }
4388 matrixR[size] = '\0';
4389
4390 size = 0;
4391 directionIndex = lSeqLength;
4392 rIndex = minIndex1;
4393
4394
4395 while(directionIndex != 0 || rIndex != 0)
4396 {
4397 if(directionIndex-rIndex == errThreshold)
4398 {
4399 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1)
4400 {
4401 matrixL[size] = 'I';
4402 size++;
4403 matrixL[size] = *(lSeq+lSeqLength-directionIndex);
4404 directionIndex--;
4405 }
4406 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
4407 {
4408 matrixL[size] = *(tempref-rIndex);
4409 rIndex--;
4410 directionIndex--;
4411 }
4412 else
4413 {
4414 matrixL[size] = 'M';
4415 rIndex--;
4416 directionIndex--;
4417 }
4418
4419 }
4420 else if(rIndex - directionIndex == errThreshold)
4421 {
4422 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1)
4423 {
4424 matrixL[size] = 'D';
4425 size++;
4426 matrixL[size] = *(tempref-rIndex);
4427 rIndex--;
4428 }
4429 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
4430 {
4431 matrixL[size] = *(tempref-rIndex);
4432 rIndex--;
4433 directionIndex--;
4434 }
4435 else
4436 {
4437 matrixL[size] = 'M';
4438 rIndex--;
4439 directionIndex--;
4440 }
4441 }
4442 else
4443 {
4444 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0)
4445 {
4446 matrixL[size] = 'I';
4447 size++;
4448 matrixL[size] = *(lSeq+lSeqLength-directionIndex);
4449 directionIndex--;
4450 }
4451 else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0)
4452 {
4453 matrixL[size] = 'D';
4454 size++;
4455 matrixL[size] = *(tempref-rIndex);
4456 rIndex--;
4457 }
4458 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
4459 {
4460 matrixL[size] = *(tempref-rIndex);
4461 rIndex--;
4462 directionIndex--;
4463 }
4464 else
4465 {
4466 matrixL[size] = 'M';
4467 rIndex--;
4468 directionIndex--;
4469 }
4470 }
4471
4472 size++;
4473 }
4474
4475 matrixL[size] = '\0';
4476 char middle[200];
4477 middle[0] = '\0';
4478
4479 for(i = 0; i < segLength; i++)
4480 middle[i] = 'M';
4481 middle[segLength] = '\0';
4482
4483 char rmatrixR[200];
4484
4485 reverse(matrixR, rmatrixR, strlen(matrixR));
4486
4487 sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR);
4488
4489 return totalError;
4490
4491 }
4492
4493 int verifySingleEndEditDistanceExtention(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength,
4494 char *matrix, int *map_location, short *seqHashValue)
4495 {
4496 int i = 0;
4497
4498 char * ref;
4499 char * tempref;
4500
4501 int rIndex = 0; //reference Index
4502
4503 int error = 0;
4504 int error1 = 0;
4505
4506 int error2 = 0;
4507 int error3 = 0;
4508 int totalError = 0;
4509 int errorSegment = 0;
4510
4511 int ERROR_BOUND = min(4, errThreshold);
4512
4513
4514 /*
4515 1: Up
4516 2: Side
4517 3: Diagnoal Match
4518 4: Diagnoal Mismatch
4519 */
4520
4521 int min = 0;
4522 int minIndex1 = 0;
4523 int minIndex2 = 0;
4524
4525 int directionIndex = 0;
4526
4527
4528 int size = 0;
4529
4530 ref = _msf_refGen + refIndex - 1;
4531 tempref = _msf_refGen + refIndex - 1;
4532
4533
4534 if(lSeqLength != 0)
4535 {
4536 error3 = backwardEditDistanceSSE2Extention(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength);
4537 if(error3 == -1){
4538 return -1;
4539 }
4540 }
4541
4542 if(rSeqLength != 0)
4543 {
4544 error2 = forwardEditDistanceSSE2Extention(ref+segLength, rSeqLength, rSeq, rSeqLength);
4545 if(error2 == -1)
4546 return -1;
4547 }
4548
4549 if(error2 + error3 > errThreshold)
4550 return -1;
4551
4552 rIndex = 1;
4553
4554 int prevError = 0;
4555
4556 int tempUp = 0;
4557 int tempDown = 0;
4558
4559 int errorString = 0;
4560
4561 int upValue;
4562 int diagValue;
4563 int sideValue;
4564 if(lSeqLength > ERROR_BOUND)
4565 {
4566 while(rIndex <= lSeqLength+ERROR_BOUND && lSeqLength != 0)
4567 {
4568 tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 );
4569 tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1);
4570 for(i = tempUp ; i < tempDown ; i++)
4571 {
4572 errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i));
4573
4574 upValue = scoreB[i-1][rIndex]+1;
4575 diagValue = scoreB[i-1][rIndex-1]+ !errorString;
4576 sideValue = scoreB[i][rIndex-1]+1;
4577
4578 if(i != tempUp && i != tempDown-1)
4579 scoreB[i][rIndex] = min3(sideValue, diagValue , upValue);
4580
4581 else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength )
4582 scoreB[i][rIndex] = min(sideValue, diagValue);
4583 else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) )
4584 scoreB[i][rIndex] = sideValue;
4585 else
4586 scoreB[i][rIndex] = min(diagValue , upValue);
4587
4588 if(i == tempUp)
4589 error = scoreB[i][rIndex];
4590 else if(error > scoreB[i][rIndex])
4591 error = scoreB[i][rIndex];
4592 }
4593 if(rIndex <= lSeqLength)
4594 {
4595 errorSegment = error-prevError;
4596 }
4597 rIndex++;
4598 }
4599
4600 if(lSeqLength != 0)
4601 {
4602 min = scoreB[lSeqLength][lSeqLength+ERROR_BOUND];
4603 minIndex1 = lSeqLength + ERROR_BOUND;
4604
4605 // Find the Best error for all the possible ways.
4606 for(i = 1; i <= 2*ERROR_BOUND; i++)
4607 {
4608 if(min >= scoreB[lSeqLength][lSeqLength+ERROR_BOUND-i] && lSeqLength+ERROR_BOUND-i > 0)
4609 {
4610 min = scoreB[lSeqLength][lSeqLength+ERROR_BOUND-i];
4611 minIndex1 = lSeqLength+ERROR_BOUND-i;
4612 }
4613 }
4614 error = scoreB[lSeqLength][minIndex1];
4615 }
4616 }
4617 else
4618 {
4619 int j = 0;
4620 for(i = 1; i <= lSeqLength; i++)
4621 {
4622 for(j = 1; j <= lSeqLength; j++)
4623 {
4624 scoreB[i][j] = min3(scoreB[i-1][j-1]+ (*(ref-j) != *(lSeq+lSeqLength-i) ),scoreB[i][j-1]+1 ,scoreB[i-1][j]+1);
4625 }
4626 }
4627 error = scoreB[lSeqLength][lSeqLength];
4628 minIndex1 = lSeqLength;
4629
4630 }
4631 error1 = error;
4632
4633 error = 0;
4634 errorSegment = 0;
4635
4636 directionIndex = lSeqLength;
4637 rIndex = minIndex1;
4638
4639 *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ;
4640
4641 ref = ref + segLength;
4642
4643 if(rSeqLength != 0 && rSeqLength > ERROR_BOUND)
4644 {
4645 ERROR_BOUND = min(ERROR_BOUND, rSeqLength);
4646
4647 if(rSeqLength == ERROR_BOUND)
4648 {
4649 for(i=0; i < 2*ERROR_BOUND; i++)
4650 scoreF[0][i] = i;
4651 }
4652
4653 rIndex = 1;
4654 while(rIndex <= rSeqLength+ERROR_BOUND)
4655 {
4656 tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1;
4657 tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1);
4658 for(i = tempUp; i < tempDown ; i++)
4659 {
4660 errorString = (*(ref+rIndex-1) == *(rSeq+i-1));
4661 upValue = scoreF[i-1][rIndex]+1;
4662 diagValue = scoreF[i-1][rIndex-1]+ !errorString;
4663 sideValue = scoreF[i][rIndex-1]+1;
4664
4665 if(i != tempUp && i != tempDown-1)
4666 scoreF[i][rIndex] = min3(sideValue, diagValue , upValue);
4667 else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength )
4668 scoreF[i][rIndex] = min(sideValue, diagValue);
4669 else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) )
4670 scoreF[i][rIndex] = sideValue;
4671 else
4672 scoreF[i][rIndex] = min(diagValue , upValue);
4673
4674 if(i == tempUp)
4675 error = scoreF[i][rIndex];
4676 if(error > scoreF[i][rIndex])
4677 error = scoreF[i][rIndex];
4678 }
4679 if(rIndex <= rSeqLength)
4680 {
4681 errorSegment = error;
4682 }
4683 rIndex++;
4684 }
4685 min = scoreF[rSeqLength][rSeqLength+ERROR_BOUND];
4686 minIndex2 = rSeqLength + ERROR_BOUND;
4687
4688 // Find the Best error for all the possible ways.
4689 for(i = 1; i <= 2*ERROR_BOUND; i++)
4690 {
4691 if(min > scoreF[rSeqLength][rSeqLength+ERROR_BOUND-i] && rSeqLength+ERROR_BOUND-i > 0)
4692 {
4693 min = scoreF[rSeqLength][rSeqLength+ERROR_BOUND-i];
4694 minIndex2 = rSeqLength+ERROR_BOUND-i;
4695 }
4696 }
4697 error = scoreF[rSeqLength][minIndex2];
4698 }
4699 else
4700 {
4701 int j = 0;
4702 for(i = 1; i <= rSeqLength; i++)
4703 {
4704 for(j = 1; j <= rSeqLength; j++)
4705 {
4706 scoreF[i][j] = min3(scoreF[i-1][j-1]+ (*(ref+j-1) != *(rSeq+i-1) ),scoreF[i][j-1]+1 ,scoreF[i-1][j]+1);
4707 }
4708 }
4709 error = scoreF[rSeqLength][rSeqLength];
4710 minIndex2 = rSeqLength;
4711 }
4712
4713 totalError = error + error1;
4714
4715 if(totalError != error2+error3)
4716 {
4717 for(i = 0; i < lSeqLength; i++)
4718 printf("%c", *(tempref-1-i));
4719 printf("\n");
4720 for(i = 0; i < lSeqLength; i++)
4721 printf("%c", *(lSeq+i));
4722 printf("\n");
4723
4724 for(i = 0; i < rSeqLength; i++)
4725 printf("%c", *(tempref+segLength+i));
4726 printf("\n");
4727
4728 for(i = 0; i < rSeqLength; i++)
4729 printf("%c", *(rSeq+i));
4730 printf("\n");
4731
4732 printf("ERROR=%d\n", totalError);
4733 printf("ERROR_SSE=%d\n", error3+error2);
4734
4735 printf("ERROR_SSE_back=%d E_SSE_forw=%d\n", error3, error2);
4736 printf("ERROR_back=%d E_forw=%d\n", error1, error);
4737
4738 }
4739
4740 char matrixR[200];
4741 char matrixL[200];
4742
4743 matrixR[0] = '\0';
4744 matrixL[0] = '\0';
4745
4746 size = 0;
4747 directionIndex = rSeqLength;
4748 rIndex = minIndex2;
4749
4750
4751 while(directionIndex != 0 || rIndex != 0)
4752 {
4753 if(directionIndex-rIndex == errThreshold)
4754 {
4755 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1)
4756 {
4757 matrixR[size] = *(rSeq+directionIndex-1);
4758 size++;
4759 matrixR[size] = 'I';
4760 directionIndex--;
4761 }
4762 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
4763 {
4764 matrixR[size] = *(ref+rIndex-1);
4765 rIndex--;
4766 directionIndex--;
4767 }
4768 else
4769 {
4770 matrixR[size] = 'M';
4771 rIndex--;
4772 directionIndex--;
4773 }
4774
4775 }
4776 else if(rIndex - directionIndex == errThreshold)
4777 {
4778 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1)
4779 {
4780 matrixR[size] = *(ref+rIndex-1);
4781 size++;
4782 matrixR[size] = 'D';
4783 rIndex--;
4784 }
4785 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
4786 {
4787 matrixR[size] = *(ref+rIndex-1);
4788 rIndex--;
4789 directionIndex--;
4790 }
4791 else
4792 {
4793 matrixR[size] = 'M';
4794 rIndex--;
4795 directionIndex--;
4796 }
4797 }
4798 else
4799 {
4800 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0)
4801 {
4802 matrixR[size] = *(rSeq+directionIndex-1);
4803 size++;
4804 matrixR[size] = 'I';
4805 directionIndex--;
4806 }
4807 else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0)
4808 {
4809 matrixR[size] = *(ref+rIndex-1);
4810 size++;
4811 matrixR[size] = 'D';
4812 rIndex--;
4813 }
4814 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
4815 {
4816 matrixR[size] = *(ref+rIndex-1);
4817 rIndex--;
4818 directionIndex--;
4819 }
4820 else
4821 {
4822 matrixR[size] = 'M';
4823 rIndex--;
4824 directionIndex--;
4825 }
4826 }
4827 size++;
4828 }
4829 matrixR[size] = '\0';
4830
4831 size = 0;
4832 directionIndex = lSeqLength;
4833 rIndex = minIndex1;
4834
4835
4836 while(directionIndex != 0 || rIndex != 0)
4837 {
4838 if(directionIndex-rIndex == errThreshold)
4839 {
4840 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1)
4841 {
4842 matrixL[size] = 'I';
4843 size++;
4844 matrixL[size] = *(lSeq+lSeqLength-directionIndex);
4845 directionIndex--;
4846 }
4847 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
4848 {
4849 matrixL[size] = *(tempref-rIndex);
4850 rIndex--;
4851 directionIndex--;
4852 }
4853 else
4854 {
4855 matrixL[size] = 'M';
4856 rIndex--;
4857 directionIndex--;
4858 }
4859
4860 }
4861 else if(rIndex - directionIndex == errThreshold)
4862 {
4863 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1)
4864 {
4865 matrixL[size] = 'D';
4866 size++;
4867 matrixL[size] = *(tempref-rIndex);
4868 rIndex--;
4869 }
4870 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
4871 {
4872 matrixL[size] = *(tempref-rIndex);
4873 rIndex--;
4874 directionIndex--;
4875 }
4876 else
4877 {
4878 matrixL[size] = 'M';
4879 rIndex--;
4880 directionIndex--;
4881 }
4882 }
4883 else
4884 {
4885 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0)
4886 {
4887 matrixL[size] = 'I';
4888 size++;
4889 matrixL[size] = *(lSeq+lSeqLength-directionIndex);
4890 directionIndex--;
4891 }
4892 else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0)
4893 {
4894 matrixL[size] = 'D';
4895 size++;
4896 matrixL[size] = *(tempref-rIndex);
4897 rIndex--;
4898 }
4899 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
4900 {
4901 matrixL[size] = *(tempref-rIndex);
4902 rIndex--;
4903 directionIndex--;
4904 }
4905 else
4906 {
4907 matrixL[size] = 'M';
4908 rIndex--;
4909 directionIndex--;
4910 }
4911 }
4912 size++;
4913 }
4914 matrixL[size] = '\0';
4915
4916 char middle[200];
4917 middle[0] = '\0';
4918 for(i = 0; i < segLength; i++)
4919 middle[i] = 'M';
4920 middle[segLength] = '\0';
4921
4922 char rmatrixR[200];
4923
4924 reverse(matrixR, rmatrixR, strlen(matrixR));
4925
4926 sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR);
4927
4928
4929 return totalError;
4930
4931 }
4932
4933
4934 int verifySingleEndEditDistance(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue)
4935 {
4936
4937 int i = 0;
4938
4939 char * ref;
4940 char * tempref;
4941
4942 int rIndex = 0; //reference Index
4943
4944 int error = 0;
4945 int error1 = 0;
4946
4947 int error2 = 0;
4948 int error3 = 0;
4949
4950 int totalError = 0;
4951 int errorSegment = 0;
4952
4953 int ERROR_BOUND = errThreshold;
4954
4955 /*
4956 1: Up
4957 2: Side
4958 3: Diagnoal Match
4959 4: Diagnoal Mismatch
4960 */
4961
4962 int min = 0;
4963 int minIndex1 = 0;
4964 int minIndex2 = 0;
4965
4966 int directionIndex = 0;
4967
4968
4969 int size = 0;
4970
4971 ref = _msf_refGen + refIndex - 1;
4972 tempref = _msf_refGen + refIndex - 1;
4973
4974
4975 if(rSeqLength != 0)
4976 {
4977 if(errThreshold %2 == 1)
4978 error2 = forwardEditDistanceSSE2Odd(ref+segLength, rSeqLength, rSeq, rSeqLength);
4979 else
4980 error2 = forwardEditDistanceSSE2G(ref+segLength, rSeqLength, rSeq, rSeqLength);
4981 if(error2 == -1)
4982 return -1;
4983 }
4984
4985 if(lSeqLength != 0)
4986 {
4987 if(errThreshold % 2 == 1)
4988 error3 = backwardEditDistanceSSE2Odd(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength);
4989 else
4990 error3 = backwardEditDistanceSSE2G(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength);
4991 if(error3 == -1 || error3 == 0){
4992 return -1;
4993 }
4994 }
4995
4996 if(error3 + error2 > errThreshold)
4997 return -1;
4998
4999 for(i = 0 ; i < errThreshold + 1; i++)
5000 {
5001 scoreB[0][i] = i;
5002 scoreB[i][0] = i;
5003 }
5004
5005 rIndex = 1;
5006 int prevError = 0;
5007
5008 int tempUp = 0;
5009 int tempDown = 0;
5010
5011 int errorString = 0;
5012
5013 int upValue;
5014 int diagValue;
5015 int sideValue;
5016
5017 while(rIndex <= lSeqLength+errThreshold && lSeqLength != 0)
5018 {
5019 tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 );
5020 tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1);
5021 for(i = tempUp ; i < tempDown ; i++)
5022 {
5023 errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i));
5024
5025 upValue = scoreB[i-1][rIndex]+1;
5026 diagValue = scoreB[i-1][rIndex-1]+ !errorString;
5027 sideValue = scoreB[i][rIndex-1]+1;
5028
5029 if(i != tempUp && i != tempDown-1)
5030 scoreB[i][rIndex] = min3(sideValue, diagValue , upValue);
5031
5032 else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength )
5033 scoreB[i][rIndex] = min(sideValue, diagValue);
5034 else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) )
5035 scoreB[i][rIndex] = sideValue;
5036 else
5037 scoreB[i][rIndex] = min(diagValue , upValue);
5038
5039 if(i == tempUp)
5040 error = scoreB[i][rIndex];
5041 else if(error > scoreB[i][rIndex])
5042 error = scoreB[i][rIndex];
5043 }
5044 if(rIndex <= lSeqLength)
5045 {
5046 errorSegment = error-prevError;
5047 }
5048 rIndex++;
5049 }
5050 if(lSeqLength != 0)
5051 {
5052 min = scoreB[lSeqLength][lSeqLength+errThreshold];
5053 minIndex1 = lSeqLength + errThreshold;
5054
5055 // Find the Best error for all the possible ways.
5056 for(i = 1; i <= 2*errThreshold; i++)
5057 {
5058 if(min >= scoreB[lSeqLength][lSeqLength+errThreshold-i] && lSeqLength+errThreshold-i > 0)
5059 {
5060 min = scoreB[lSeqLength][lSeqLength+errThreshold-i];
5061 minIndex1 = lSeqLength+errThreshold-i;
5062 }
5063 }
5064 error = scoreB[lSeqLength][minIndex1];
5065 }
5066
5067 error1 = error;
5068
5069 error = 0;
5070 errorSegment = 0;
5071
5072 directionIndex = lSeqLength;
5073 rIndex = minIndex1;
5074
5075 *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ;
5076
5077 ref = ref + segLength;
5078
5079 if(rSeqLength != 0)
5080 {
5081 for(i = 0 ; i < errThreshold + 1; i++)
5082 {
5083 scoreF[0][i] = i;
5084 scoreF[i][0] = i;
5085 }
5086
5087
5088 rIndex = 1;
5089 while(rIndex <= rSeqLength+errThreshold-error1)
5090 {
5091 tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1;
5092 tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1);
5093 for(i = tempUp; i < tempDown ; i++)
5094 {
5095 errorString = (*(ref+rIndex-1) == *(rSeq+i-1));
5096
5097 upValue = scoreF[i-1][rIndex]+1;
5098 diagValue = scoreF[i-1][rIndex-1]+ !errorString;
5099 sideValue = scoreF[i][rIndex-1]+1;
5100
5101 if(i != tempUp && i != tempDown-1)
5102 scoreF[i][rIndex] = min3(sideValue, diagValue , upValue);
5103 else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength )
5104 scoreF[i][rIndex] = min(sideValue, diagValue);
5105 else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) )
5106 scoreF[i][rIndex] = sideValue;
5107 else
5108 scoreF[i][rIndex] = min(diagValue , upValue);
5109
5110 if(i == tempUp)
5111 error = scoreF[i][rIndex];
5112 if(error > scoreF[i][rIndex])
5113 error = scoreF[i][rIndex];
5114 }
5115 if(rIndex <= rSeqLength)
5116 {
5117 errorSegment = error;
5118 }
5119 rIndex++;
5120 }
5121
5122 min = scoreF[rSeqLength][rSeqLength+errThreshold-error1];
5123 minIndex2 = rSeqLength + errThreshold-error1;
5124
5125 // Find the Best error for all the possible ways.
5126 for(i = 1; i <= 2*(errThreshold-error1); i++)
5127 {
5128 if(min > scoreF[rSeqLength][rSeqLength+errThreshold-error1-i] && rSeqLength+errThreshold-error1-i > 0)
5129 {
5130 min = scoreF[rSeqLength][rSeqLength+errThreshold-error1-i];
5131 minIndex2 = rSeqLength+errThreshold-error1-i;
5132 }
5133 }
5134 error = scoreF[rSeqLength][minIndex2];
5135 }
5136
5137 totalError = error + error1;
5138
5139
5140 if(totalError != error2 + error3 && totalError > errThreshold)
5141 {
5142 for(i = 0; i < lSeqLength; i++)
5143 printf("%c", *(tempref-1-i));
5144 printf("\n");
5145 for(i = 0; i < lSeqLength; i++)
5146 printf("%c", *(lSeq+i));
5147 printf("\n");
5148
5149 for(i = 0; i < rSeqLength; i++)
5150 printf("%c", *(tempref+segLength+i));
5151 printf("\n");
5152
5153 for(i = 0; i < rSeqLength; i++)
5154 printf("%c", *(rSeq+i));
5155 printf("\n");
5156
5157
5158 printf("SSEF=%d SSEB%d\n", error2, error3);
5159 printf("F=%d B=%d\n", error, error1);
5160 scanf("%d", &i);
5161 }
5162
5163 char matrixR[200];
5164 char matrixL[200];
5165
5166 matrixR[0] = '\0';
5167 matrixL[0] = '\0';
5168
5169 size = 0;
5170 directionIndex = rSeqLength;
5171 rIndex = minIndex2;
5172
5173 while(directionIndex != 0 || rIndex != 0)
5174 {
5175 if(directionIndex-rIndex == errThreshold)
5176 {
5177 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1)
5178 {
5179 matrixR[size] = *(rSeq+directionIndex-1);
5180 size++;
5181 matrixR[size] = 'I';
5182 directionIndex--;
5183 }
5184 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
5185 {
5186 matrixR[size] = *(ref+rIndex-1);
5187 rIndex--;
5188 directionIndex--;
5189 }
5190 else
5191 {
5192 matrixR[size] = 'M';
5193 rIndex--;
5194 directionIndex--;
5195 }
5196
5197 }
5198 else if(rIndex - directionIndex == errThreshold)
5199 {
5200 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1)
5201 {
5202 matrixR[size] = *(ref+rIndex-1);
5203 size++;
5204 matrixR[size] = 'D';
5205 rIndex--;
5206 }
5207 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
5208 {
5209 matrixR[size] = *(ref+rIndex-1);
5210 rIndex--;
5211 directionIndex--;
5212 }
5213 else
5214 {
5215 matrixR[size] = 'M';
5216 rIndex--;
5217 directionIndex--;
5218 }
5219 }
5220 else
5221 {
5222 if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0)
5223 {
5224 matrixR[size] = *(rSeq+directionIndex-1);
5225 size++;
5226 matrixR[size] = 'I';
5227 directionIndex--;
5228 }
5229 else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0)
5230 {
5231 matrixR[size] = *(ref+rIndex-1);
5232 size++;
5233 matrixR[size] = 'D';
5234 rIndex--;
5235 }
5236 else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 )
5237 {
5238 matrixR[size] = *(ref+rIndex-1);
5239 rIndex--;
5240 directionIndex--;
5241 }
5242 else
5243 {
5244 matrixR[size] = 'M';
5245 rIndex--;
5246 directionIndex--;
5247 }
5248 }
5249 size++;
5250 }
5251 matrixR[size] = '\0';
5252
5253 size = 0;
5254 directionIndex = lSeqLength;
5255 rIndex = minIndex1;
5256
5257
5258 while(directionIndex != 0 || rIndex != 0)
5259 {
5260 if(directionIndex-rIndex == errThreshold)
5261 {
5262 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1)
5263 {
5264 matrixL[size] = 'I';
5265 size++;
5266 matrixL[size] = *(lSeq+lSeqLength-directionIndex);
5267 directionIndex--;
5268 }
5269 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
5270 {
5271 matrixL[size] = *(tempref-rIndex);
5272 rIndex--;
5273 directionIndex--;
5274 }
5275 else
5276 {
5277 matrixL[size] = 'M';
5278 rIndex--;
5279 directionIndex--;
5280 }
5281
5282 }
5283 else if(rIndex - directionIndex == errThreshold)
5284 {
5285 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1)
5286 {
5287 matrixL[size] = 'D';
5288 size++;
5289 matrixL[size] = *(tempref-rIndex);
5290 rIndex--;
5291 }
5292 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
5293 {
5294 matrixL[size] = *(tempref-rIndex);
5295 rIndex--;
5296 directionIndex--;
5297 }
5298 else
5299 {
5300 matrixL[size] = 'M';
5301 rIndex--;
5302 directionIndex--;
5303 }
5304 }
5305 else
5306 {
5307 if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0)
5308 {
5309 matrixL[size] = 'I';
5310 size++;
5311 matrixL[size] = *(lSeq+lSeqLength-directionIndex);
5312 directionIndex--;
5313 }
5314 else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0)
5315 {
5316 matrixL[size] = 'D';
5317 size++;
5318 matrixL[size] = *(tempref-rIndex);
5319 rIndex--;
5320 }
5321 else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 )
5322 {
5323 matrixL[size] = *(tempref-rIndex);
5324 rIndex--;
5325 directionIndex--;
5326 }
5327 else
5328 {
5329 matrixL[size] = 'M';
5330 rIndex--;
5331 directionIndex--;
5332 }
5333 }
5334 size++;
5335 }
5336 matrixL[size] = '\0';
5337 char middle[200];
5338 middle[0] = '\0';
5339 for(i = 0; i < segLength; i++)
5340 middle[i] = 'M';
5341 middle[segLength] = '\0';
5342
5343 char rmatrixR[200];
5344
5345 reverse(matrixR, rmatrixR, strlen(matrixR));
5346
5347 sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR);
5348
5349 return totalError;
5350 }
5351
5352
/*
 * Number of characters that "%d" prints for a non-negative count.
 * Values below 10 (including zero and negatives) report 1, as before;
 * counts of 1000 and above are now sized correctly instead of being
 * capped at 3.
 */
int addCigarSize(int cnt){
	int digits = 1;

	while (cnt >= 10)
	{
		cnt /= 10;
		digits++;
	}
	return digits;
}
5358
/*
 * Build a run-length CIGAR string from a backtracking matrix.
 *
 * matrix is read for matrixLength characters:
 *   'I' + one character  -> one inserted base (the second character is skipped)
 *   'D' + one character  -> one deleted base  (the second character is skipped)
 *   anything else        -> one aligned base, counted as 'M'
 * Consecutive operations of the same kind are merged into "<count><op>".
 *
 * cigar receives the NUL-terminated result; the caller must supply a buffer
 * large enough for it. An empty matrix yields an empty string.
 *
 * Each run is appended at the current end of the output. The previous
 * version passed cigar to sprintf() both as destination and as a "%s"
 * argument, which is undefined behaviour.
 */
void generateCigar(char *matrix, int matrixLength, char *cigar)
{
	char runOp = '\0';	/* operation of the run being counted */
	int runLength = 0;	/* number of operations in that run */
	int cigarSize = 0;	/* characters written so far */
	int i = 0;

	cigar[0] = '\0';

	while (i < matrixLength)
	{
		char op;

		if (matrix[i] == 'I' || matrix[i] == 'D')
		{
			op = matrix[i];
			i++;		/* step over the base that follows the marker */
		}
		else
		{
			op = 'M';	/* match or mismatch */
		}

		/* The operation changed: emit the run that just ended. */
		if (op != runOp && runLength != 0)
		{
			cigarSize += sprintf(cigar + cigarSize, "%d%c", runLength, runOp);
			runLength = 0;
		}

		runOp = op;
		runLength++;
		i++;
	}

	if (runLength != 0)
		cigarSize += sprintf(cigar + cigarSize, "%d%c", runLength, runOp);

	cigar[cigarSize] = '\0';
}
5480
/*
 * Expand a mismatch (MD-style) string of the form
 *     [0-9]+(([ACTGN]|\^[ACTGN]+)[0-9]+)*
 * into one operation letter per position.
 *
 * mismatch is read for at most mismatchLength characters; cigar receives
 * the NUL-terminated result and must be large enough for it.
 *
 *   decimal number N -> N 'M' characters (a value of 0 still writes one 'M')
 *   '^'              -> 'I', and the following character is skipped
 *   '\''             -> 'D', and the following character is skipped
 *   anything else    -> 'M'
 *
 * The digit scan now checks the length before reading a character and the
 * number is parsed inside that bound; the earlier atoi() call could run
 * past mismatchLength.
 *
 * NOTE(review): two behaviours are preserved unchanged because the intended
 * output format cannot be confirmed from here:
 *   - after a number, the character that follows it is skipped;
 *   - the fallback branch advances the output position twice, leaving one
 *     byte of cigar unwritten after the 'M'.
 */
void generateCigarFromMD(char *mismatch, int mismatchLength, char *cigar)
{
	int i = 0;
	int j = 0;
	int cigarSize = 0;

	cigar[0] = '\0';

	while (i < mismatchLength)
	{
		if (mismatch[i] >= '0' && mismatch[i] <= '9')
		{
			int value = 0;

			/* Bounds test first, then the character test. */
			while (i < mismatchLength && mismatch[i] >= '0' && mismatch[i] <= '9')
			{
				value = value * 10 + (mismatch[i] - '0');
				i++;
			}

			for (j = 0; j < value-1; j++)
			{
				cigar[cigarSize] = 'M';
				cigarSize++;
			}
			cigar[cigarSize] = 'M';
		}
		else if (mismatch[i] == '^')
		{
			cigar[cigarSize] = 'I';
			i++;
		}
		else if (mismatch[i] == '\'')
		{
			cigar[cigarSize] = 'D';
			i++;
		}
		else
		{
			cigar[cigarSize] = 'M';
			cigarSize++;
		}
		cigarSize++;
		i++;
	}
	cigar[cigarSize] = '\0';
}
5531
/*
 * Build the mismatch (MD-style) string from a backtracking matrix.
 *
 * matrix is read for matrixLength characters:
 *   'M'                 -> extends the current run of matches
 *   'D' + one character -> that character is a deleted reference base;
 *                          consecutive deletions are written as one "^BASES"
 *   'I' + one character -> skipped; a run of matches continues across it,
 *                          but an open deletion group is closed
 *   anything else       -> a mismatch, written as that character
 * A pending run of matches is written as a decimal count in front of the
 * next deletion or mismatch and at the end of the matrix.
 *
 * outputSNP receives the NUL-terminated result and must be large enough.
 *
 * Output is streamed to the end of the buffer. The previous version passed
 * outputSNP to sprintf() as both destination and "%s" source (undefined
 * behaviour), staged deletions in a fixed 100-byte buffer, and read
 * matrix[i+1] without checking it against matrixLength.
 *
 * NOTE(review): no "0" is written between adjacent mismatches/deletions or
 * at the ends of the string; this matches the previous output - confirm
 * against what downstream consumers of the MD tag expect.
 */
void generateSNPSAM(char *matrix, int matrixLength, char *outputSNP)
{
	int i = 0;
	int counterM = 0;	/* length of the pending run of matches */
	int inDeletion = 0;	/* non-zero while a "^..." group is open */
	int snpSize = 0;	/* characters written so far */

	outputSNP[0] = '\0';

	while (i < matrixLength)
	{
		if (matrix[i] == 'M')
		{
			counterM++;
			inDeletion = 0;
		}
		else if (matrix[i] == 'D')
		{
			if (counterM != 0)
			{
				snpSize += sprintf(outputSNP + snpSize, "%d", counterM);
				counterM = 0;
			}
			if (!inDeletion)
			{
				outputSNP[snpSize++] = '^';
				inDeletion = 1;
			}
			if (i + 1 < matrixLength)
				outputSNP[snpSize++] = matrix[i+1];
			i++;		/* step over the deleted base */
		}
		else if (matrix[i] == 'I')
		{
			inDeletion = 0;
			i++;		/* step over the inserted base */
		}
		else
		{
			if (counterM != 0)
			{
				snpSize += sprintf(outputSNP + snpSize, "%d", counterM);
				counterM = 0;
			}
			inDeletion = 0;
			outputSNP[snpSize++] = matrix[i];
		}
		i++;
	}

	if (counterM != 0)
		snpSize += sprintf(outputSNP + snpSize, "%d", counterM);

	outputSNP[snpSize] = '\0';
}
5650 /**********************************************/
5651
5652 /*
5653 direction = 0 forward
5654 1 backward
5655
5656 */
5657
5658 void mapSingleEndSeq(unsigned int *l1, int s1, int readNumber, int readSegment, int direction)
5659 {
5660 int j = 0;
5661 int z = 0;
5662 int *locs = (int *) l1;
5663 char *_tmpSeq, *_tmpQual;
5664 char rqual[SEQ_LENGTH+1];
5665 rqual[SEQ_LENGTH]='\0';
5666
5667 int genLoc = 0;
5668 int leftSeqLength = 0;
5669 int rightSeqLength = 0;
5670 int middleSeqLength = 0;
5671
5672 char matrix[200];
5673 char editString[200];
5674 char cigar[MAX_CIGAR_SIZE];
5675
5676 short *_tmpHashValue;
5677
5678 if (direction)
5679 {
5680 reverse(_msf_seqList[readNumber].qual, rqual, SEQ_LENGTH);
5681 _tmpQual = rqual;
5682 _tmpSeq = _msf_seqList[readNumber].rseq;
5683 _tmpHashValue = _msf_seqList[readNumber].rhashValue;
5684 }
5685 else
5686 {
5687 _tmpQual = _msf_seqList[readNumber].qual;
5688 _tmpSeq = _msf_seqList[readNumber].seq;
5689 _tmpHashValue = _msf_seqList[readNumber].hashValue;
5690 }
5691
5692 int readId = 2*readNumber+direction;
5693 for (z=0; z<s1; z++)
5694 {
5695
5696
5697 int map_location = 0;
5698 int a = 0;
5699 int o = readSegment;
5700
5701 genLoc = locs[z];//-_msf_samplingLocs[o];
5702
5703
5704 if ( genLoc-_msf_samplingLocs[o] < _msf_refGenBeg ||
5705 genLoc-_msf_samplingLocs[o] > _msf_refGenEnd ||
5706 _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == readId ||
5707 _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == -readId
5708 )
5709 continue;
5710 int err = -1;
5711
5712
5713 map_location = 0;
5714
5715 leftSeqLength = _msf_samplingLocs[o];
5716 middleSeqLength = WINDOW_SIZE;
5717 a = leftSeqLength + middleSeqLength;
5718 rightSeqLength = SEQ_LENGTH - a;
5719
5720 if(errThreshold == 2)
5721 err = verifySingleEndEditDistance2(genLoc, _tmpSeq, leftSeqLength,
5722 _tmpSeq + a, rightSeqLength,
5723 middleSeqLength, matrix, &map_location, _tmpHashValue);
5724 else if(errThreshold == 4)
5725 err = verifySingleEndEditDistance4(genLoc, _tmpSeq, leftSeqLength,
5726 _tmpSeq + a, rightSeqLength,
5727 middleSeqLength, matrix, &map_location, _tmpHashValue);
5728 else if(errThreshold ==3)
5729 err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength,
5730 _tmpSeq + a, rightSeqLength,
5731 middleSeqLength, matrix, &map_location, _tmpHashValue);
5732 /*else if(errThreshold == 6)
5733 err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength,
5734 _tmpSeq + a, rightSeqLength,
5735 middleSeqLength, matrix, &map_location, _tmpHashValue);
5736 */
5737 else
5738 err = verifySingleEndEditDistanceExtention(genLoc, _tmpSeq, leftSeqLength,
5739 _tmpSeq + a, rightSeqLength,
5740 middleSeqLength, matrix, &map_location, _tmpHashValue);
5741
5742 if(err != -1)
5743 {
5744 generateSNPSAM(matrix, strlen(matrix), editString);
5745 generateCigar(matrix, strlen(matrix), cigar);
5746 }
5747
5748 if(err != -1 && !bestMode)
5749 {
5750
5751 mappingCnt++;
5752
5753 int j = 0;
5754 int k = 0;
5755 for(k = 0; k < readSegment+1; k++)
5756 {
5757 for(j = -errThreshold ; j <= errThreshold; j++)
5758 {
5759 if(genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j >= _msf_refGenBeg &&
5760 genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j <= _msf_refGenEnd)
5761 _msf_verifiedLocs[genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j] = readId;
5762 }
5763 }
5764 _msf_seqList[readNumber].hits[0]++;
5765
5766 _msf_output.QNAME = _msf_seqList[readNumber].name;
5767 _msf_output.FLAG = 16 * direction;
5768 _msf_output.RNAME = _msf_refGenName;
5769 _msf_output.POS = map_location + _msf_refGenOffset;
5770 _msf_output.MAPQ = 255;
5771 _msf_output.CIGAR = cigar;
5772 _msf_output.MRNAME = "*";
5773 _msf_output.MPOS = 0;
5774 _msf_output.ISIZE = 0;
5775 _msf_output.SEQ = _tmpSeq;
5776 _msf_output.QUAL = _tmpQual;
5777
5778 _msf_output.optSize = 2;
5779 _msf_output.optFields = _msf_optionalFields;
5780
5781 _msf_optionalFields[0].tag = "NM";
5782 _msf_optionalFields[0].type = 'i';
5783 _msf_optionalFields[0].iVal = err;
5784
5785 _msf_optionalFields[1].tag = "MD";
5786 _msf_optionalFields[1].type = 'Z';
5787 _msf_optionalFields[1].sVal = editString;
5788
5789 output(_msf_output);
5790
5791
5792 if (_msf_seqList[readNumber].hits[0] == 1)
5793 {
5794 mappedSeqCnt++;
5795 }
5796
5797 if ( maxHits == 0 )
5798 {
5799 _msf_seqList[readNumber].hits[0] = 2;
5800 }
5801
5802
5803 if ( maxHits!=0 && _msf_seqList[readNumber].hits[0] == maxHits)
5804 {
5805 completedSeqCnt++;
5806 break;
5807 }
5808
5809 }
5810 else if(err != -1 && bestMode)
5811 {
5812 mappingCnt++;
5813 _msf_seqList[readNumber].hits[0]++;
5814
5815 if (_msf_seqList[readNumber].hits[0] == 1)
5816 {
5817 mappedSeqCnt++;
5818 }
5819
5820 if ( maxHits == 0 )
5821 {
5822 _msf_seqList[readNumber].hits[0] = 2;
5823 }
5824
5825 if(err < bestHitMappingInfo[readNumber].err || bestHitMappingInfo[readNumber].loc == -1)
5826 {
5827 setFullMappingInfo(readNumber, map_location + _msf_refGenOffset, direction, err, 0, editString, _msf_refGenName, cigar );
5828 }
5829 }
5830 else
5831 {
5832 for(j = -errThreshold ; j <= errThreshold; j++)
5833 {
5834 if(genLoc+j > _msf_refGenBeg &&
5835 genLoc+j < _msf_refGenEnd)
5836 _msf_verifiedLocs[genLoc+j] = -readId;
5837 }
5838 }
5839 }
5840 }
5841
5842
5843 int mapAllSingleEndSeq()
5844 {
5845 int i = 0;
5846 int j = 0;
5847 int k = 0;
5848
5849
5850 unsigned int *locs = NULL;
5851
5852
5853 int prev_hash = 0;
5854
5855 for(i = 0; i < _msf_seqListSize; i++)
5856 {
5857 for(j = 0; j < _msf_samplingLocsSize; j++)
5858 {
5859 k = _msf_sort_seqList[i].readNumber;
5860 // if(j != 0)
5861 // if(strncmp(_msf_seqList[k].seq+_msf_samplingLocs[j], _msf_seqList[k].seq+_msf_samplingLocs[j-1], segSize) == 0)
5862 // continue;
5863 // if(prev_hash == hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]))
5864 // continue;
5865 locs = getCandidates ( hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]));
5866 if ( locs != NULL)
5867 {
5868 mapSingleEndSeq(locs+1, locs[0],k ,j, 0);
5869 }
5870 prev_hash = hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]);
5871 }
5872 }
5873 i = 0;
5874
5875 for(i = 0; i < _msf_seqListSize; i++)
5876 {
5877 for(j = 0; j < _msf_samplingLocsSize; j++)
5878 {
5879 k = _msf_sort_seqList[i].readNumber;
5880
5881 // if(j != 0)
5882 // if(strncmp(_msf_seqList[k].rseq+_msf_samplingLocs[j], _msf_seqList[k].rseq+_msf_samplingLocs[j-1], segSize) == 0)
5883 // continue;
5884 // if(prev_hash == hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]))
5885 // continue;
5886 locs = getCandidates ( hashVal(_msf_seqList[k].rseq+_msf_samplingLocs[j]));
5887 if ( locs != NULL)
5888 {
5889 mapSingleEndSeq(locs+1, locs[0],k ,j, 1);
5890 }
5891 prev_hash = hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]);
5892 }
5893 }
5894 return 1;
5895 }
5896
5897
5898 /**********************************************/
5899 /**********************************************/
5900 /**********************************************/
5901 /**********************************************/
5902 /**********************************************/
5903 int compareOut (const void *a, const void *b)
5904 {
5905 FullMappingInfo *aInfo = (FullMappingInfo *)a;
5906 FullMappingInfo *bInfo = (FullMappingInfo *)b;
5907 return aInfo->loc - bInfo->loc;
5908 }
5909
5910
5911
5912 /**********************************************/
5913
5914 /*
5915 direction 0: Forward
5916 1: Reverse
5917 */
5918
5919 void mapPairEndSeqList(unsigned int *l1, int s1, int readNumber, int readSegment, int direction)
5920 {
5921 int z = 0;
5922 int *locs = (int *) l1;
5923 char *_tmpSeq;
5924
5925 char rqual[SEQ_LENGTH+1];
5926
5927 char matrix[200];
5928 char editString[200];
5929 char cigar[MAX_CIGAR_SIZE];
5930
5931 short *_tmpHashValue;
5932
5933 int leftSeqLength = 0;
5934 int middleSeqLength = 0;
5935 int rightSeqLength =0;
5936 int a = 0;
5937
5938 rqual[SEQ_LENGTH]='\0';
5939
5940
5941 int r = readNumber;
5942
5943 char d = (direction==1)?-1:1;
5944
5945 if (d==-1)
5946 {
5947 _tmpSeq = _msf_seqList[readNumber].rseq;
5948 _tmpHashValue = _msf_seqList[r].rhashValue;
5949 }
5950 else
5951 {
5952 _tmpSeq = _msf_seqList[readNumber].seq;
5953 _tmpHashValue = _msf_seqList[r].hashValue;
5954 }
5955
5956 int readId = 2*readNumber+direction;
5957 for (z=0; z<s1; z++)
5958 {
5959 int genLoc = locs[z];//-_msf_samplingLocs[o];
5960 int err = -1;
5961 int map_location = 0;
5962 int o = readSegment;
5963
5964 leftSeqLength = _msf_samplingLocs[o];
5965 middleSeqLength = WINDOW_SIZE;
5966 a = leftSeqLength + middleSeqLength;
5967 rightSeqLength = SEQ_LENGTH - a;
5968
5969 if(genLoc - leftSeqLength < _msf_refGenBeg || genLoc + rightSeqLength + middleSeqLength > _msf_refGenEnd ||
5970 _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == readId || _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == -readId)
5971 continue;
5972
5973 if(errThreshold == 2)
5974 err = verifySingleEndEditDistance2(genLoc, _tmpSeq, leftSeqLength,
5975 _tmpSeq + a, rightSeqLength,
5976 middleSeqLength, matrix, &map_location, _tmpHashValue);
5977 else if(errThreshold == 4)
5978 err = verifySingleEndEditDistance4(genLoc, _tmpSeq, leftSeqLength,
5979 _tmpSeq + a, rightSeqLength,
5980 middleSeqLength, matrix, &map_location, _tmpHashValue);
5981 else if(errThreshold ==3)
5982 err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength,
5983 _tmpSeq + a, rightSeqLength,
5984 middleSeqLength, matrix, &map_location, _tmpHashValue);
5985 /*else if(errThreshold == 6)
5986 err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength,
5987 _tmpSeq + a, rightSeqLength,
5988 middleSeqLength, matrix, &map_location, _tmpHashValue);*/
5989 else
5990 err = verifySingleEndEditDistanceExtention(genLoc, _tmpSeq, leftSeqLength,
5991 _tmpSeq + a, rightSeqLength,
5992 middleSeqLength, matrix, &map_location, _tmpHashValue);
5993
5994
5995 if (err != -1)
5996 {
5997 int j = 0;
5998 int k = 0;
5999
6000 for(k = 0; k < readSegment+1; k++)
6001 {
6002 for(j = -errThreshold ; j <= errThreshold; j++)
6003 {
6004 if(genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j >= _msf_refGenBeg &&
6005 genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j <= _msf_refGenEnd)
6006 _msf_verifiedLocs[genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j] = readId;
6007 }
6008 }
6009
6010
6011 generateSNPSAM(matrix, strlen(matrix), editString);
6012 generateCigar(matrix, strlen(matrix), cigar);
6013
6014 MappingLocations *parent = NULL;
6015 MappingLocations *child = _msf_mappingInfo[r].next;
6016
6017 genLoc = map_location + _msf_refGenOffset;
6018 int i = 0;
6019 for (i=0; i<(_msf_mappingInfo[r].size/MAP_CHUNKS); i++)
6020 {
6021 parent = child;
6022 child = child->next;
6023 }
6024
6025 if (child==NULL)
6026 {
6027 MappingLocations *tmp = getMem(sizeof(MappingLocations));
6028
6029 tmp->next = NULL;
6030 tmp->loc[0]=genLoc * d;
6031 tmp->err[0]=err;
6032
6033 tmp->cigarSize[0] = strlen(cigar);
6034 sprintf(tmp->cigar[0],"%s", cigar);
6035
6036 tmp->mdSize[0] = strlen(editString);
6037 sprintf(tmp->md[0],"%s", editString);
6038
6039 if (parent == NULL)
6040 _msf_mappingInfo[r].next = tmp;
6041 else
6042 parent->next = tmp;
6043 }
6044 else
6045 {
6046 if(strlen(cigar) > SEQ_LENGTH || strlen(editString) > SEQ_LENGTH)
6047 {
6048 printf("ERROR in %d read size(After mapping) exceedes cigar=%d md =%d cigar=%s md =%s\n", r, (int)strlen(cigar), (int)strlen(editString), cigar, editString);
6049 }
6050
6051 child->loc[_msf_mappingInfo[r].size % MAP_CHUNKS] = genLoc * d;
6052 child->err[_msf_mappingInfo[r].size % MAP_CHUNKS] = err;
6053
6054 child->cigarSize[_msf_mappingInfo[r].size % MAP_CHUNKS] = strlen(cigar);
6055 sprintf(child->cigar[_msf_mappingInfo[r].size % MAP_CHUNKS],"%s",cigar);
6056
6057 child->mdSize[_msf_mappingInfo[r].size % MAP_CHUNKS] = strlen(editString);
6058 sprintf(child->md[_msf_mappingInfo[r].size % MAP_CHUNKS],"%s",editString);
6059 }
6060 _msf_mappingInfo[r].size++;
6061
6062 }
6063 else
6064 {
6065 _msf_verifiedLocs[genLoc] = -readId;
6066 }
6067
6068 }
6069 }
6070
6071 /**********************************************/
6072 void mapPairedEndSeq()
6073 {
6074 int i = 0;
6075 int j = 0;
6076 int k = 0;
6077
6078 unsigned int *locs = NULL;
6079 while ( i < _msf_seqListSize )
6080 {
6081 for(j = 0; j < _msf_samplingLocsSize; j++)
6082 {
6083 k = _msf_sort_seqList[i].readNumber;
6084 locs = getCandidates ( hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]));
6085 if ( locs != NULL)
6086 {
6087 mapPairEndSeqList(locs+1, locs[0],k ,j, 0);
6088 }
6089 }
6090 i++;
6091 }
6092 i = 0;
6093
6094 while ( i < _msf_seqListSize )
6095 {
6096 for(j = 0; j < _msf_samplingLocsSize; j++)
6097 {
6098 k = _msf_sort_seqList[i].readNumber;
6099 locs = getCandidates ( hashVal(_msf_seqList[k].rseq+_msf_samplingLocs[j]));
6100 if ( locs != NULL)
6101 {
6102 mapPairEndSeqList(locs+1, locs[0],k ,j, 1);
6103 }
6104 }
6105
6106 i++;
6107 }
6108 char fname1[FILE_NAME_LENGTH];
6109 char fname2[FILE_NAME_LENGTH];
6110 MappingLocations *cur;
6111 int tmpOut;
6112 int lmax=0, rmax=0;
6113
6114 sprintf(fname1, "%s__%s__%s__%d__1.tmp",mappingOutputPath, _msf_refGenName, mappingOutput, _msf_openFiles);
6115 sprintf(fname2, "%s__%s__%s__%d__2.tmp",mappingOutputPath, _msf_refGenName, mappingOutput, _msf_openFiles);
6116
6117 FILE* out;
6118 FILE* out1 = fileOpen(fname1, "w");
6119 FILE* out2 = fileOpen(fname2, "w");
6120
6121 _msf_openFiles++;
6122
6123 for (i=0; i<_msf_seqListSize; i++)
6124 {
6125
6126 if (i%2==0)
6127 {
6128 out = out1;
6129
6130 if (lmax < _msf_mappingInfo[i].size)
6131 {
6132 lmax = _msf_mappingInfo[i].size;
6133 }
6134 }
6135 else
6136 {
6137 out = out2;
6138 if (rmax < _msf_mappingInfo[i].size)
6139 {
6140 rmax = _msf_mappingInfo[i].size;
6141 }
6142 }
6143
6144 tmpOut = fwrite(&(_msf_mappingInfo[i].size), sizeof(int), 1, out);
6145 if (_msf_mappingInfo[i].size > 0)
6146 {
6147 cur = _msf_mappingInfo[i].next;
6148 for (j=0; j < _msf_mappingInfo[i].size; j++)
6149 {
6150 if ( j>0 && j%MAP_CHUNKS==0)
6151 {
6152 cur = cur->next;
6153 }
6154 if(cur->cigarSize[j % MAP_CHUNKS] > SEQ_LENGTH || cur->mdSize[j % MAP_CHUNKS] > SEQ_LENGTH)
6155 {
6156 printf("ERROR in %d read size exceeds cigar=%d md =%d cigar=%s md =%s\n", i, cur->cigarSize[j % MAP_CHUNKS], cur->mdSize[j % MAP_CHUNKS], cur->cigar[j % MAP_CHUNKS], cur->md[j % MAP_CHUNKS]);
6157 }
6158
6159 tmpOut = fwrite(&(cur->loc[j % MAP_CHUNKS]), sizeof(int), 1, out);
6160
6161 tmpOut = fwrite(&(cur->err[j % MAP_CHUNKS]), sizeof(int), 1, out);
6162
6163 tmpOut = fwrite(&(cur->cigarSize[j % MAP_CHUNKS]), sizeof(int), 1, out);
6164 tmpOut = fwrite((cur->cigar[j % MAP_CHUNKS]), sizeof(char), (cur->cigarSize[j % MAP_CHUNKS]), out);
6165
6166 tmpOut = fwrite(&(cur->mdSize[j % MAP_CHUNKS]), sizeof(int), 1, out);
6167 tmpOut = fwrite((cur->md[j % MAP_CHUNKS]), sizeof(char), (cur->mdSize[j % MAP_CHUNKS]), out);
6168
6169 }
6170 _msf_mappingInfo[i].size = 0;
6171 //_msf_mappingInfo[i].next = NULL;
6172 }
6173 }
6174
6175 _msf_maxLSize += lmax;
6176 _msf_maxRSize += rmax;
6177
6178 fclose(out1);
6179 fclose(out2);
6180
6181 }
6182
6183 void outputPairFullMappingInfo(FILE *fp, int readNumber)
6184 {
6185
6186 char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2;
6187 char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1];
6188
6189 rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0';
6190
6191 seq1 = _msf_seqList[readNumber*2].seq;
6192 rseq1 = _msf_seqList[readNumber*2].rseq;
6193 qual1 = _msf_seqList[readNumber*2].qual;
6194
6195 reverse(_msf_seqList[readNumber*2].qual, rqual1, SEQ_LENGTH);
6196
6197 seq2 = _msf_seqList[readNumber*2+1].seq;
6198 rseq2 = _msf_seqList[readNumber*2+1].rseq;
6199 qual2 = _msf_seqList[readNumber*2+1].qual;
6200
6201 reverse(_msf_seqList[readNumber*2+1].qual, rqual2, SEQ_LENGTH);
6202
6203
6204 if(bestHitMappingInfo[readNumber*2].loc == -1 && bestHitMappingInfo[readNumber*2+1].loc == -1)
6205 return;
6206 else
6207 {
6208
6209 char *seq;
6210 char *qual;
6211 char d1;
6212 char d2;
6213 int isize;
6214 int proper=0;
6215 // ISIZE CALCULATION
6216 // The distance between outer edges
6217 isize = abs(bestHitMappingInfo[readNumber*2].loc - bestHitMappingInfo[readNumber*2+1].loc)+SEQ_LENGTH - 2;
6218
6219 if (bestHitMappingInfo[readNumber*2].loc - bestHitMappingInfo[readNumber*2+1].loc > 0)
6220 {
6221 isize *= -1;
6222 }
6223 d1 = (bestHitMappingInfo[readNumber*2].dir == -1)?1:0;
6224 d2 = (bestHitMappingInfo[readNumber*2+1].dir == -1)?1:0;
6225
6226 if ( d1 )
6227 {
6228 seq = rseq1;
6229 qual = rqual1;
6230 }
6231 else
6232 {
6233 seq = seq1;
6234 qual = qual1;
6235 }
6236 if ( (bestHitMappingInfo[readNumber*2].loc < bestHitMappingInfo[readNumber*2+1].loc && !d1 && d2) ||
6237 (bestHitMappingInfo[readNumber*2].loc > bestHitMappingInfo[readNumber*2+1].loc && d1 && !d2) )
6238 {
6239 proper = 2;
6240 }
6241 else
6242 {
6243 proper = 0;
6244 }
6245
6246 _msf_output.POS = bestHitMappingInfo[readNumber*2].loc;
6247 _msf_output.MPOS = bestHitMappingInfo[readNumber*2+1].loc;
6248 _msf_output.FLAG = 1+proper+16*d1+32*d2+64;
6249 _msf_output.ISIZE = isize;
6250 _msf_output.SEQ = seq,
6251 _msf_output.QUAL = qual;
6252 _msf_output.QNAME = _msf_seqList[readNumber*2].name;
6253 _msf_output.RNAME = bestHitMappingInfo[readNumber*2].chr;
6254 _msf_output.MAPQ = 255;
6255 _msf_output.CIGAR = bestHitMappingInfo[readNumber*2].cigar;
6256 _msf_output.MRNAME = "=";
6257
6258 _msf_output.optSize = 2;
6259 _msf_output.optFields = _msf_optionalFields;
6260
6261 _msf_optionalFields[0].tag = "NM";
6262 _msf_optionalFields[0].type = 'i';
6263 _msf_optionalFields[0].iVal = bestHitMappingInfo[readNumber*2].err;
6264
6265 _msf_optionalFields[1].tag = "MD";
6266 _msf_optionalFields[1].type = 'Z';
6267 _msf_optionalFields[1].sVal = bestHitMappingInfo[readNumber*2].md;
6268
6269 outputSAM(fp, _msf_output);
6270 output(_msf_output);
6271
6272 if ( d2 )
6273 {
6274 seq = rseq2;
6275 qual = rqual2;
6276 }
6277 else
6278 {
6279 seq = seq2;
6280 qual = qual2;
6281 }
6282
6283 _msf_output.POS = bestHitMappingInfo[readNumber*2+1].loc;
6284 _msf_output.MPOS = bestHitMappingInfo[readNumber*2].loc;
6285 _msf_output.FLAG = 1+proper+16*d2+32*d1+128;
6286 _msf_output.ISIZE = -isize;
6287 _msf_output.SEQ = seq,
6288 _msf_output.QUAL = qual;
6289 _msf_output.QNAME = _msf_seqList[readNumber*2].name;
6290 _msf_output.RNAME = bestHitMappingInfo[readNumber*2].chr;
6291 _msf_output.MAPQ = 255;
6292 _msf_output.CIGAR = bestHitMappingInfo[readNumber*2+1].cigar;
6293 _msf_output.MRNAME = "=";
6294
6295 _msf_output.optSize = 2;
6296 _msf_output.optFields = _msf_optionalFields;
6297
6298 _msf_optionalFields[0].tag = "NM";
6299 _msf_optionalFields[0].type = 'i';
6300 _msf_optionalFields[0].iVal = bestHitMappingInfo[readNumber*2+1].err;
6301
6302 _msf_optionalFields[1].tag = "MD";
6303 _msf_optionalFields[1].type = 'Z';
6304 _msf_optionalFields[1].sVal = bestHitMappingInfo[readNumber*2+1].md;
6305
6306 outputSAM(fp, _msf_output);
6307 output(_msf_output);
6308 }
6309 }
6310
6311
/*
  Find which of x1 and x2 is closest to c.
  @param x1 first candidate value
  @param x2 second candidate value
  @param c  target value
  @return 0: if x1 is closer to c
          1: if x2 is closer to c
          2: if both distances are equal
  (-1 was documented as an error code; the three cases above are exhaustive,
   so it is never returned.)

  The previous implementation had the two strict comparisons swapped and
  therefore returned 0 when x1 was the FARTHER value, contradicting this
  contract.
*/
int findNearest(int x1, int x2, int c)
{
	const int dist1 = abs(x1 - c);
	const int dist2 = abs(x2 - c);

	if (dist1 < dist2)
		return 0;
	if (dist1 > dist2)
		return 1;
	return 2;
}
6331
6332 void initBestConcordantDiscordant(int readNumber)
6333 {
6334 char bestConcordantFileName[FILE_NAME_LENGTH];
6335 char bestDiscordantFileName[FILE_NAME_LENGTH];
6336
6337 //OPEN THE BEST CONCORDANT FILE
6338 //BEGIN{Farhad Hormozdiari}
6339 sprintf(bestConcordantFileName, "%s%s__BEST.CONCORDANT", mappingOutputPath, mappingOutput);
6340 bestConcordantFILE = fileOpen(bestConcordantFileName, "w");
6341 //END{Farhad Hormozdiari}
6342
6343
6344 //OPEN THE BEST DISCORDANT FILE
6345 //BEGIN{Farhad Hormozdiari}
6346 sprintf(bestDiscordantFileName, "%s%s__BEST.DISCORDANT", mappingOutputPath, mappingOutput);
6347 bestDiscordantFILE = fileOpen(bestDiscordantFileName, "w");
6348 //END{Farhad Hormozdiari}
6349
6350 initBestMapping(readNumber);
6351 }
6352
6353 void finalizeBestConcordantDiscordant()
6354 {
6355 int i = 0;
6356
6357 for(i = 0; i<_msf_seqListSize/2; i++)
6358 {
6359 if(_msf_readHasConcordantMapping[i]==1)
6360 outputPairFullMappingInfo(bestConcordantFILE, i);
6361 else
6362 outputPairFullMappingInfo(bestDiscordantFILE, i);
6363 }
6364
6365 fclose(bestConcordantFILE);
6366 fclose(bestDiscordantFILE);
6367
6368 freeMem(bestHitMappingInfo, _msf_seqListSize * sizeof(FullMappingInfo));
6369 }
6370
6371 void setFullMappingInfo(int readNumber, int loc, int dir, int err, int score, char *md, char * refName, char *cigar)
6372 {
6373 bestHitMappingInfo[readNumber].loc = loc;
6374 bestHitMappingInfo[readNumber].dir = dir;
6375 bestHitMappingInfo[readNumber].err = err;
6376 bestHitMappingInfo[readNumber].score = score;
6377
6378 strncpy(bestHitMappingInfo[readNumber].md, md, strlen(md));
6379 strncpy(bestHitMappingInfo[readNumber].chr, refName, strlen(refName));
6380 strncpy(bestHitMappingInfo[readNumber].cigar, cigar, strlen(cigar));
6381 }
6382
6383
6384 void setPairFullMappingInfo(int readNumber, FullMappingInfo mi1, FullMappingInfo mi2)
6385 {
6386
6387 bestHitMappingInfo[readNumber*2].loc = mi1.loc;
6388 bestHitMappingInfo[readNumber*2].dir = mi1.dir;
6389 bestHitMappingInfo[readNumber*2].err = mi1.err;
6390 bestHitMappingInfo[readNumber*2].score = mi1.score;
6391 snprintf(bestHitMappingInfo[readNumber*2].chr, MAX_REF_SIZE, "%s", _msf_refGenName);
6392
6393 strncpy(bestHitMappingInfo[readNumber*2].md, mi1.md, strlen(mi1.md));
6394 strncpy(bestHitMappingInfo[readNumber*2].cigar, mi1.cigar, strlen(mi1.cigar));
6395
6396 /*
6397 sprintf(bestHitMappingInfo[readNumber*2].md, "%s\0", mi1.md);
6398 sprintf(bestHitMappingInfo[readNumber*2].cigar, "%s\0", mi1.cigar);
6399 */
6400
6401
6402 bestHitMappingInfo[readNumber*2+1].loc = mi2.loc;
6403 bestHitMappingInfo[readNumber*2+1].dir = mi2.dir;
6404 bestHitMappingInfo[readNumber*2+1].err = mi2.err;
6405 bestHitMappingInfo[readNumber*2+1].score = mi2.score;
6406
6407 snprintf(bestHitMappingInfo[readNumber*2+1].chr, MAX_REF_SIZE, "%s", _msf_refGenName);
6408
6409 /*
6410 sprintf(bestHitMappingInfo[readNumber*2+1].md, "%s\0", mi2.md);
6411 sprintf(bestHitMappingInfo[readNumber*2+1].cigar, "%s\0", mi2.cigar);
6412 */
6413
6414 strncpy(bestHitMappingInfo[readNumber*2+1].md, mi2.md, strlen(mi2.md));
6415 strncpy(bestHitMappingInfo[readNumber*2+1].cigar, mi2.cigar, strlen(mi2.cigar));
6416
6417 }
6418
6419 /**********************************************/
6420 void outputPairedEnd()
6421 {
6422 int i = 0;
6423
6424 char cigar[MAX_CIGAR_SIZE];
6425
6426 int tmpOut;
6427
6428 loadRefGenome(&_msf_refGen, &_msf_refGenName, &tmpOut);
6429
6430 FILE* in1[_msf_openFiles];
6431 FILE* in2[_msf_openFiles];
6432
6433 char fname1[_msf_openFiles][FILE_NAME_LENGTH];
6434 char fname2[_msf_openFiles][FILE_NAME_LENGTH];
6435
6436 // discordant
6437 FILE *out=NULL, *out1=NULL;
6438
6439 char fname3[FILE_NAME_LENGTH];
6440 char fname4[FILE_NAME_LENGTH];
6441
6442 int meanDistanceMapping = 0;
6443
6444 char *rqual1;
6445 char *rqual2;
6446
6447 rqual1 = getMem((SEQ_LENGTH+1)*sizeof(char));
6448 rqual2 = getMem((SEQ_LENGTH+1)*sizeof(char));
6449
6450 if (pairedEndDiscordantMode)
6451 {
6452 sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput);
6453 sprintf(fname4, "%s__%s__oea", mappingOutputPath, mappingOutput);
6454 out = fileOpen(fname3, "a");
6455 out1 = fileOpen(fname4, "a");
6456 }
6457
6458 FullMappingInfo *mi1 = getMem(sizeof(FullMappingInfo) * _msf_maxLSize);
6459 FullMappingInfo *mi2 = getMem(sizeof(FullMappingInfo) * _msf_maxRSize);
6460
6461 _msf_fileCount[_msf_maxFile] = 0;
6462 for (i=0; i<_msf_openFiles; i++)
6463 {
6464 sprintf(fname1[i], "%s__%s__%s__%d__1.tmp", mappingOutputPath, _msf_refGenName, mappingOutput, i);
6465 sprintf(_msf_fileName[_msf_maxFile][_msf_fileCount[_msf_maxFile]][0], "%s", fname1[i]);
6466
6467 sprintf(fname2[i], "%s__%s__%s__%d__2.tmp", mappingOutputPath, _msf_refGenName, mappingOutput, i);
6468 sprintf(_msf_fileName[_msf_maxFile][_msf_fileCount[_msf_maxFile]][1], "%s", fname2[i]);
6469
6470 in1[i] = fileOpen(fname1[i], "r");
6471 in2[i] = fileOpen(fname2[i], "r");
6472 _msf_fileCount[_msf_maxFile]++;
6473 }
6474 _msf_maxFile++;
6475
6476 int size;
6477 int j, k;
6478 int size1, size2;
6479
6480 meanDistanceMapping = (pairedEndDiscordantMode==1)? (minPairEndedDiscordantDistance+maxPairEndedDiscordantDistance)/2 + SEQ_LENGTH
6481 : (minPairEndedDistance + maxPairEndedDistance) / 2 + SEQ_LENGTH;
6482
6483 for (i=0; i<_msf_seqListSize/2; i++)
6484 {
6485 size1 = size2 = 0;
6486 for (j=0; j<_msf_openFiles; j++)
6487 {
6488 tmpOut = fread(&size, sizeof(int), 1, in1[j]);
6489 if ( size > 0 )
6490 {
6491 for (k=0; k<size; k++)
6492 {
6493 mi1[size1+k].dir = 1;
6494 tmpOut = fread (&(mi1[size1+k].loc), sizeof(int), 1, in1[j]);
6495 tmpOut = fread (&(mi1[size1+k].err), sizeof(int), 1, in1[j]);
6496
6497 tmpOut = fread (&(mi1[size1+k].cigarSize), sizeof(int), 1, in1[j]);
6498 tmpOut = fread ((mi1[size1+k].cigar), sizeof(char), mi1[size1+k].cigarSize, in1[j]);
6499 mi1[size1+k].cigar[mi1[size1+k].cigarSize] = '\0';
6500
6501 tmpOut = fread (&(mi1[size1+k].mdSize), sizeof(int), 1, in1[j]);
6502 tmpOut = fread ((mi1[size1+k].md), sizeof(char), (mi1[size1+k].mdSize), in1[j]);
6503 mi1[size1+k].md[mi1[size1+k].mdSize] = '\0';
6504
6505 if (mi1[size1+k].loc<1)
6506 {
6507 mi1[size1+k].loc *= -1;
6508 mi1[size1+k].dir = -1;
6509 }
6510 }
6511 qsort(mi1+size1, size, sizeof(FullMappingInfo), compareOut);
6512 size1+=size;
6513 }
6514 }
6515
6516 for (j=0; j<_msf_openFiles; j++)
6517 {
6518 tmpOut = fread(&size, sizeof(int), 1, in2[j]);
6519 if ( size > 0 )
6520 {
6521 for (k=0; k<size; k++)
6522 {
6523 mi2[size2+k].dir = 1;
6524 tmpOut = fread (&(mi2[size2+k].loc), sizeof(int), 1, in2[j]);
6525 tmpOut = fread (&(mi2[size2+k].err), sizeof(int), 1, in2[j]);
6526
6527 tmpOut = fread (&(mi2[size2+k].cigarSize), sizeof(int), 1, in2[j]);
6528 tmpOut = fread ((mi2[size2+k].cigar), sizeof(char), mi2[size2+k].cigarSize, in2[j]);
6529 mi2[size2+k].cigar[mi2[size2+k].cigarSize] = '\0';
6530
6531 tmpOut = fread (&(mi2[size2+k].mdSize), sizeof(int), 1, in2[j]);
6532 tmpOut = fread ((mi2[size2+k].md), sizeof(char), mi2[size2+k].mdSize, in2[j]);
6533 mi2[size2+k].md[mi2[size2+k].mdSize] = '\0';
6534
6535 if (mi2[size2+k].loc<1)
6536 {
6537 mi2[size2+k].loc *= -1;
6538 mi2[size2+k].dir = -1;
6539 }
6540 }
6541 qsort(mi2+size2, size, sizeof(FullMappingInfo), compareOut);
6542 size2+=size;
6543 }
6544 }
6545
6546 int lm, ll, rl, rm;
6547 int pos = 0;
6548
6549 if (pairedEndDiscordantMode)
6550 {
6551
6552 for (j=0; j<size1; j++)
6553 {
6554 lm = mi1[j].loc - maxPairEndedDiscordantDistance + 1;
6555 ll = mi1[j].loc - minPairEndedDiscordantDistance + 1;
6556 rl = mi1[j].loc + minPairEndedDiscordantDistance - 1;
6557 rm = mi1[j].loc + maxPairEndedDiscordantDistance - 1;
6558
6559 while (pos<size2 && mi2[pos].loc < lm)
6560 {
6561 pos++;
6562 }
6563
6564 k = pos;
6565 while (k<size2 && mi2[k].loc<=rm)
6566 {
6567 if ( mi2[k].loc <= ll || mi2[k].loc >= rl)
6568 {
6569 if ( (mi1[j].loc < mi2[k].loc && mi1[j].dir==1 && mi2[k].dir == -1) ||
6570 (mi1[j].loc > mi2[k].loc && mi1[j].dir==-1 && mi2[k].dir == 1) )
6571 {
6572 _msf_seqList[i*2].hits[0]=1;
6573 _msf_seqList[i*2+1].hits[0]=1;
6574
6575 if(nosamMode != 0)
6576 {
6577 size1=0;
6578 size2=0;
6579 }
6580
6581 break;
6582 }
6583 }
6584 k++;
6585 }
6586 }
6587
6588 _msf_seqHits[i*2] += size1;
6589 _msf_seqHits[i*2+1] += size2;
6590
6591
6592 if (_msf_seqHits[i*2+1] * _msf_seqHits[i*2] > DISCORDANT_CUT_OFF && nosamMode != 0)
6593 {
6594 _msf_seqList[i*2].hits[0]=1;
6595 _msf_seqList[i*2+1].hits[0]=1;
6596 size1=0;
6597 size2=0;
6598 }
6599
6600
6601
6602
6603 int tmp = 0;
6604 int rNo = 0;
6605 int loc = 0;
6606 int err = 0;
6607 float sc = 0;
6608 char l = 0;
6609
6610 //write the OEA data
6611 if(_msf_seqHits[i*2] == 0 )
6612 {
6613 for(k = 0;k < size2 && _msf_oeaMapping[i*2+1] < maxOEAOutput ;k++)
6614 {
6615 rNo = i*2+1;
6616 loc = mi2[k].loc*mi2[k].dir;
6617 err = mi2[k].err;
6618 sc = mi2[k].score;
6619
6620 l = strlen(_msf_refGenName);
6621
6622 tmp = fwrite(&rNo, sizeof(int), 1, out1);
6623
6624 tmp = fwrite(&l, sizeof(char), 1, out1);
6625 tmp = fwrite(_msf_refGenName, sizeof(char), l, out1);
6626
6627 tmp = fwrite(&loc, sizeof(int), 1, out1);
6628 tmp = fwrite(&err, sizeof(int), 1, out1);
6629 tmp = fwrite(&sc, sizeof(float), 1, out1);
6630
6631 if(mi2[k].cigarSize > SEQ_LENGTH || mi2[k].cigarSize <= 0)
6632 printf("ERROR CIGAR size=%d %s\n", mi2[k].cigarSize, _msf_seqList[i*2+1].seq);
6633
6634 tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out1);
6635 tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out1);
6636
6637 tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out1);
6638 tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out1);
6639
6640 _msf_oeaMapping[i*2+1]++;
6641 }
6642 }
6643 if(_msf_seqHits[i*2+1] == 0)
6644 {
6645 for(j = 0;j < size1 && _msf_oeaMapping[i*2] < maxOEAOutput;j++)
6646 {
6647 rNo = i*2;
6648 loc = mi1[j].loc*mi1[j].dir;
6649 err = mi1[j].err;
6650 sc = mi1[j].score;
6651
6652 l = strlen(_msf_refGenName);
6653
6654 tmp = fwrite(&rNo, sizeof(int), 1, out1);
6655
6656 tmp = fwrite(&l, sizeof(char), 1, out1);
6657 tmp = fwrite(_msf_refGenName, sizeof(char), l, out1);
6658
6659 tmp = fwrite(&loc, sizeof(int), 1, out1);
6660 tmp = fwrite(&err, sizeof(int), 1, out1);
6661 tmp = fwrite(&sc, sizeof(float), 1, out1);
6662
6663 if(mi1[j].cigarSize > SEQ_LENGTH || mi1[j].cigarSize <= 0 )
6664 printf("ERROR %d %s\n", mi1[j].cigarSize, _msf_seqList[i*2+1].seq);
6665
6666 tmp = fwrite (&(mi1[j].cigarSize), sizeof(int), 1, out1);
6667 tmp = fwrite ((mi1[j].cigar), sizeof(char), mi1[j].cigarSize, out1);
6668
6669 tmp = fwrite (&(mi1[j].mdSize), sizeof(int), 1, out1);
6670 tmp = fwrite ((mi1[j].md), sizeof(char), mi1[j].mdSize, out1);
6671
6672 _msf_oeaMapping[i*2]++;
6673 }
6674 }
6675 }
6676
6677 char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2;
6678
6679
6680
6681
6682 rqual1[SEQ_LENGTH] = '\0';
6683 rqual2[SEQ_LENGTH] = '\0';
6684 rqual1[0] = '\0';
6685 rqual2[0] = '\0';
6686
6687
6688
6689 seq1 = _msf_seqList[i*2].seq;
6690 rseq1 = _msf_seqList[i*2].rseq;
6691 qual1 = _msf_seqList[i*2].qual;
6692
6693
6694
6695 strncpy(rqual1, _msf_seqList[i*2].qual, SEQ_LENGTH);
6696
6697 seq2 = _msf_seqList[i*2+1].seq;
6698 rseq2 = _msf_seqList[i*2+1].rseq;
6699 qual2 = _msf_seqList[i*2+1].qual;
6700
6701
6702 strncpy(rqual2, _msf_seqList[i*2+1].qual, SEQ_LENGTH);
6703
6704 if (pairedEndDiscordantMode)
6705 {
6706 for (k=0; k<size1; k++)
6707 {
6708 mi1[k].score = calculateScore(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, (mi1[k].dir==-1)?rqual1:qual1, mi1[k].cigar);
6709 }
6710
6711 for (k=0; k<size2; k++)
6712 {
6713 mi2[k].score = calculateScore(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, (mi2[k].dir==-1)?rqual2:qual2, mi2[k].cigar);
6714 }
6715
6716 }
6717
6718
6719 if (pairedEndDiscordantMode)
6720 {
6721 for (j=0; j<size1; j++)
6722 {
6723 for(k = 0; k < size2; k++)
6724 {
6725 if(
6726 (mi2[k].loc-mi1[j].loc >= minPairEndedDiscordantDistance &&
6727 mi2[k].loc-mi1[j].loc <= maxPairEndedDiscordantDistance &&
6728 mi1[j].dir > 0 && mi2[k].dir < 0 )
6729
6730 ||
6731
6732 (mi1[j].loc-mi2[k].loc >= minPairEndedDiscordantDistance &&
6733 mi1[j].loc-mi2[k].loc <= maxPairEndedDiscordantDistance &&
6734 mi1[j].dir < 0 && mi2[k].dir > 0)
6735 )
6736 {
6737 //POSSIBLE CONCORDANT
6738 if(_msf_readHasConcordantMapping[i] == 0)
6739 {
6740 setPairFullMappingInfo(i, mi1[j], mi2[k]);
6741 _msf_readHasConcordantMapping[i] = 1;
6742 _msf_seqList[i*2].hits[0] = 1;
6743 _msf_seqList[i*2+1].hits[0] = 1;
6744 }
6745 else
6746 {
6747 if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err)
6748 {
6749
6750 if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err ==
6751 mi1[j].err + mi2[k].err &&
6752 findNearest(abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc),
6753 abs(mi2[k].loc - mi1[j].loc),
6754 meanDistanceMapping
6755 ) == 0 )
6756 {
6757 continue;
6758 }
6759 setPairFullMappingInfo(i, mi1[j], mi2[k]);
6760 }
6761 }
6762 }
6763 //DISCORDANT TO TEMP FILE FOR POST PROCESSIING
6764 else if(_msf_readHasConcordantMapping[i] == 0 &&
6765 _msf_seqHits[i*2] != 0 &&
6766 _msf_seqHits[i*2+1] != 0)
6767 {
6768
6769 int tmp;
6770 int rNo = i;
6771 int loc = mi1[j].loc*mi1[j].dir;
6772 int err = mi1[j].err;
6773 float sc = mi1[j].score;
6774
6775 char l = strlen(_msf_refGenName);
6776
6777 if(_msf_discordantMapping[i*2] < maxDiscordantOutput)
6778 {
6779
6780 tmp = fwrite(&rNo, sizeof(int), 1, out);
6781
6782 tmp = fwrite(&l, sizeof(char), 1, out);
6783 tmp = fwrite(_msf_refGenName, sizeof(char), l, out);
6784
6785 tmp = fwrite(&loc, sizeof(int), 1, out);
6786 tmp = fwrite(&err, sizeof(int), 1, out);
6787 tmp = fwrite(&sc, sizeof(float), 1, out);
6788
6789 tmp = fwrite (&(mi1[j].cigarSize), sizeof(int), 1, out);
6790 tmp = fwrite ((mi1[j].cigar), sizeof(char), mi1[j].cigarSize, out);
6791
6792 tmp = fwrite (&(mi1[j].mdSize), sizeof(int), 1, out);
6793 tmp = fwrite ((mi1[j].md), sizeof(char), mi1[j].mdSize, out);
6794
6795
6796 loc = mi2[k].loc*mi2[k].dir;
6797 err = mi2[k].err;
6798 sc = mi2[k].score;
6799
6800 tmp = fwrite(&loc, sizeof(int), 1, out);
6801 tmp = fwrite(&err, sizeof(int), 1, out);
6802 tmp = fwrite(&sc, sizeof(float), 1, out);
6803
6804 tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out);
6805 tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out);
6806
6807 tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out);
6808 tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out);
6809
6810
6811 _msf_discordantMapping[i*2]++;
6812 }
6813 //SET THE BEST DISCORDANT
6814 //BEGIN {Farhad Hormozdiari}
6815 if( bestHitMappingInfo[i*2].loc == -1 &&
6816 bestHitMappingInfo[i*2+1].loc == -1 &&
6817 _msf_readHasConcordantMapping[i] == 0)
6818 {
6819 setPairFullMappingInfo(i, mi1[j], mi2[k]);
6820 _msf_seqList[i*2].hits[0] = 1;
6821 _msf_seqList[i*2+1].hits[0] = 1;
6822 }
6823 else if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err
6824 && _msf_readHasConcordantMapping[i] == 0)
6825 {
6826 if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == mi1[j].err + mi2[k].err &&
6827 findNearest( abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc),
6828 abs(mi1[j].loc - mi2[k].loc),
6829 meanDistanceMapping
6830 ) == 0
6831 )
6832 {
6833 continue;
6834 }
6835 setPairFullMappingInfo(i, mi1[j], mi2[k]);
6836 }
6837 //END {Farhad Hormozdiari}
6838 }
6839 }
6840 }
6841 }
6842 else
6843 {
6844 for (j=0; j<size1; j++)
6845 {
6846 for(k = 0; k < size2; k++)
6847 {
6848 if((mi2[k].loc-mi1[j].loc >= minPairEndedDistance &&
6849 mi2[k].loc-mi1[j].loc <= maxPairEndedDistance &&
6850 mi1[j].dir > 0 && mi2[k].dir < 0)
6851 ||
6852 (mi1[j].loc-mi2[k].loc >= minPairEndedDistance &&
6853 mi1[j].loc-mi2[k].loc <= maxPairEndedDistance &&
6854 mi1[j].dir < 0 && mi2[k].dir > 0)
6855 )
6856 {
6857 char *seq;
6858 char *qual;
6859 char d1;
6860 char d2;
6861 int isize;
6862 int proper=0;
6863 // ISIZE CALCULATION
6864 // The distance between outer edges
6865 isize = abs(mi1[j].loc - mi2[k].loc)+SEQ_LENGTH-2;
6866 if (mi1[j].loc - mi2[k].loc > 0)
6867 {
6868 isize *= -1;
6869 }
6870
6871 d1 = (mi1[j].dir == -1)?1:0;
6872 d2 = (mi2[k].dir == -1)?1:0;
6873
6874 //SET THE READ HAS CONCORDANT MAPPING
6875 _msf_readHasConcordantMapping[i] = 1;
6876
6877 if ( d1 )
6878 {
6879 seq = rseq1;
6880 qual = rqual1;
6881 }
6882 else
6883 {
6884 seq = seq1;
6885 qual = qual1;
6886 }
6887
6888 if ((mi1[j].loc < mi2[k].loc && !d1 && d2) ||
6889 (mi1[j].loc > mi2[k].loc && d1 && !d2) )
6890 {
6891 proper = 2;
6892 }
6893 else
6894 {
6895 proper = 0;
6896 }
6897
6898
6899 _msf_output.POS = mi1[j].loc;
6900 _msf_output.MPOS = mi2[k].loc;
6901 _msf_output.FLAG = 1+proper+16*d1+32*d2+64;
6902 _msf_output.ISIZE = isize;
6903 _msf_output.SEQ = seq,
6904 _msf_output.QUAL = qual;
6905 _msf_output.QNAME = _msf_seqList[i*2].name;
6906 _msf_output.RNAME = _msf_refGenName;
6907 _msf_output.MAPQ = 255;
6908 _msf_output.CIGAR = cigar;
6909 _msf_output.MRNAME = "=";
6910
6911 _msf_output.optSize = 2;
6912 _msf_output.optFields = _msf_optionalFields;
6913
6914 _msf_optionalFields[0].tag = "NM";
6915 _msf_optionalFields[0].type = 'i';
6916 _msf_optionalFields[0].iVal = mi1[j].err;
6917
6918 _msf_optionalFields[1].tag = "MD";
6919 _msf_optionalFields[1].type = 'Z';
6920 _msf_optionalFields[1].sVal = mi1[j].md;
6921
6922 if(!bestMode)
6923 output(_msf_output);
6924
6925 if ( d2 )
6926 {
6927 seq = rseq2;
6928 qual = rqual2;
6929 }
6930 else
6931 {
6932 seq = seq2;
6933 qual = qual2;
6934 }
6935
6936 _msf_output.POS = mi2[k].loc;
6937 _msf_output.MPOS = mi1[j].loc;
6938 _msf_output.FLAG = 1+proper+16*d2+32*d1+128;
6939 _msf_output.ISIZE = -isize;
6940 _msf_output.SEQ = seq,
6941 _msf_output.QUAL = qual;
6942 _msf_output.QNAME = _msf_seqList[i*2].name;
6943 _msf_output.RNAME = _msf_refGenName;
6944 _msf_output.MAPQ = 255;
6945 _msf_output.CIGAR = cigar;
6946 _msf_output.MRNAME = "=";
6947
6948 _msf_output.optSize = 2;
6949 _msf_output.optFields = _msf_optionalFields;
6950
6951 _msf_optionalFields[0].tag = "NM";
6952 _msf_optionalFields[0].type = 'i';
6953 _msf_optionalFields[0].iVal = mi2[k].err;;
6954
6955 _msf_optionalFields[1].tag = "MD";
6956 _msf_optionalFields[1].type = 'Z';
6957 _msf_optionalFields[1].sVal = mi2[k].md;
6958
6959 if(!bestMode)
6960 output(_msf_output);
6961 //SET THE BEST CONCORDANT
6962 //BEGIN {Farhad Hormozdiari}
6963 if(bestHitMappingInfo[i*2].loc == -1 && bestHitMappingInfo[i*2+1].loc == -1)
6964 {
6965 setPairFullMappingInfo(i, mi1[j], mi2[k]);
6966 }
6967 else
6968 {
6969 if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err)
6970 {
6971
6972 if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == mi1[j].err + mi2[k].err &&
6973 findNearest(abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc),
6974 abs(mi2[k].loc - mi1[j].loc),
6975 meanDistanceMapping
6976 ) == 0 )
6977 {
6978 continue;
6979 }
6980 setPairFullMappingInfo(i, mi1[j], mi2[k]);
6981 }
6982 }
6983 //END {Farhad Hormozdiari}
6984 }
6985 }
6986 }
6987
6988 }
6989 }
6990
6991 freeMem(rqual1, 0);
6992 freeMem(rqual2, 0);
6993
6994 if (pairedEndDiscordantMode)
6995 {
6996 fclose(out);
6997 fclose(out1);
6998 }
6999
7000 for (i=0; i<_msf_openFiles; i++)
7001 {
7002 fclose(in1[i]);
7003 fclose(in2[i]);
7004
7005 unlink(fname1[i]);
7006 unlink(fname2[i]);
7007 }
7008
7009 freeMem(mi1, sizeof(FullMappingInfo)*_msf_maxLSize);
7010 freeMem(mi2, sizeof(FullMappingInfo)*_msf_maxRSize);
7011
7012 _msf_openFiles = 0;
7013 }
7014
7015 /**********************************************/
7016 /**********************************************/
7017 /**********************************************/
7018 /**********************************************/
/*
 * Parse the decimal number stored in str[index1 .. index2-1].
 *
 * str    : source string
 * index1 : first character of the number
 * index2 : one past the last character of the number
 * return : the parsed value as a float (0 for an empty or non-numeric range)
 *
 * The range is clamped to the size of the scratch buffer; a negative range is
 * treated as empty.  Previously an over-long range overran the buffer and a
 * negative one was passed to strncpy as a huge length.
 */
float str2int(char *str, int index1, int index2)
{
	char tmp[200];
	int len = index2 - index1;

	if (len < 0)
		len = 0;
	if (len > (int)sizeof(tmp) - 1)
		len = (int)sizeof(tmp) - 1;

	/* strncpy stops reading at a NUL inside the range, as before. */
	strncpy(tmp, &str[index1], (size_t)len);
	tmp[len] = '\0';

	return (float)strtol(tmp, NULL, 10);
}
7026
7027 float calculateScore(int index, char *seq, char *qual,char *md)
7028 {
7029 int i;
7030 int j;
7031 char *ref;
7032 char *ver;
7033
7034 ref = _msf_refGen + index-1;
7035 ver = seq;
7036 float score = 1;
7037
7038 char tmp[200];
7039 int value = 0;
7040 int end = 0;
7041 int index1 = 0;
7042 int index2 = 0;
7043
7044 i=0;
7045 while(1)
7046 {
7047
7048 if(i>=strlen(md))
7049 break;
7050
7051 index1 = i;
7052
7053 while(md[i] >='0' && md[i]<='9')
7054 {
7055 i++;
7056 }
7057
7058 index2 = i;
7059
7060 value = str2int(md, index1,index2);
7061
7062 if(md[i]=='M')
7063 {
7064 for(j=0;j<value;j++)
7065 {
7066 tmp[end]='M';
7067 end++;
7068 }
7069 }
7070 else if(md[i]=='I')
7071 {
7072 for(j=0;j<value;j++)
7073 {
7074 tmp[end]='I';
7075 end++;
7076 }
7077
7078 }
7079 else if(md[i] == 'D')
7080 {
7081 for(j=0;j<value;j++)
7082 {
7083 tmp[end]='D';
7084 end++;
7085 }
7086 }
7087 i++;
7088 }
7089
7090 tmp[end] = '\0';
7091
7092 j = 0;
7093
7094 for (i = 0; i < end; i++)
7095 {
7096 if(tmp[i] == 'M')
7097 {
7098 if (*ref != *ver)
7099 {
7100 score *= 0.001 + 1/pow( 10, ((qual[j]-33)/10.0) );
7101 }
7102
7103 ref++;
7104 ver++;
7105 j++;
7106 }
7107 else if(tmp[i] == 'I')
7108 {
7109 ver++;
7110 j++;
7111 }
7112 else if(tmp[i] == 'D')
7113 {
7114 ref++;
7115 }
7116 }
7117
7118 return score;
7119 }
7120
/*
 * Parse the decimal number stored in str[start .. end-1].
 *
 * return : the parsed value (0 for an empty or non-numeric range)
 *
 * The range is clamped to the scratch buffer; previously a range of 200 or
 * more characters overran it.
 */
int matoi(char *str, int start, int end)
{
	char tmp[200];
	int len = end - start;

	if (len < 0)
		len = 0;
	if (len > (int)sizeof(tmp) - 1)
		len = (int)sizeof(tmp) - 1;

	memcpy(tmp, str + start, (size_t)len);
	tmp[len] = '\0';

	return (int)strtol(tmp, NULL, 10);
}
7132
/*
 * Expand a run-length CIGAR string into one character per position,
 * e.g. "3M1I2D" -> "MMMIDD".
 *
 * cigar      : CIGAR text
 * cigar_size : number of characters of `cigar` to read
 * matrix     : output buffer, NUL-terminated on return; the caller must
 *              provide room for the sum of all counts plus one
 *
 * Changes relative to the previous version:
 *   - the bounds check now precedes the read of cigar[i];
 *   - the count is parsed inline (no fixed-size scratch buffer);
 *   - a trailing count without an op is ignored instead of reading
 *     cigar[cigar_size];
 *   - ops other than M/D/I are skipped; they used to advance the output
 *     position without writing, leaving uninitialised bytes in `matrix`.
 */
void convertCigarToMatrix(char *cigar, int cigar_size, char * matrix)
{
	int i = 0;
	int size = 0;

	matrix[0] = '\0';

	while (i < cigar_size)
	{
		if (cigar[i] >= '0' && cigar[i] <= '9')
		{
			int value = 0;
			int j;
			char op;

			while (i < cigar_size && cigar[i] >= '0' && cigar[i] <= '9')
			{
				value = value * 10 + (cigar[i] - '0');
				i++;
			}

			if (i >= cigar_size)
				break;

			op = cigar[i];
			if (op == 'M' || op == 'D' || op == 'I')
			{
				for (j = 0; j < value; j++)
				{
					matrix[size] = op;
					size++;
				}
			}
		}
		/* step over the op character (or a stray non-digit) */
		i++;
	}
	matrix[size] = '\0';
}
7168
7169
7170
/*
 * Expand an MD string into one character per item:
 *   - a number N becomes N 'M' characters,
 *   - '^' becomes 'D',
 *   - any other character is copied unchanged.
 * Example: "10A5^AC6" -> 10 x 'M', "A", 5 x 'M', "DAC", 6 x 'M'.
 *
 * md      : MD text
 * md_size : number of characters of `md` to read
 * matrix  : output buffer, NUL-terminated on return; the caller must provide
 *           enough room
 *
 * The bounds check now precedes the read of md[i], and the number is parsed
 * inline instead of through a fixed-size scratch buffer.
 */
void convertMDToMatrix(char *md, int md_size, char * matrix)
{
	int i = 0;
	int size = 0;

	matrix[0] = '\0';

	while (i < md_size)
	{
		if (md[i] >= '0' && md[i] <= '9')
		{
			int value = 0;
			int j;

			while (i < md_size && md[i] >= '0' && md[i] <= '9')
			{
				value = value * 10 + (md[i] - '0');
				i++;
			}

			for (j = 0; j < value; j++)
			{
				matrix[size] = 'M';
				size++;
			}
			/* i already rests on the first non-digit */
			continue;
		}

		matrix[size] = (md[i] == '^') ? 'D' : md[i];
		size++;
		i++;
	}
	matrix[size] = '\0';
}
7213
7214
/*
 * Merge a CIGAR string and an MD string into one per-position edit string.
 *
 * Both inputs are first expanded (convertMDToMatrix / convertCigarToMatrix)
 * and then walked in parallel, driven by the expanded CIGAR:
 *   'M' : the current expanded-MD character is copied ('M' for a match, the
 *         stored base for a mismatch); 'M' is written once the MD is exhausted
 *   'D' : 'D' is written, followed by the next expanded-MD character
 *   'I' : 'I' is written
 * After every CIGAR position the MD cursor advances by one while MD
 * characters remain.
 *
 * matrix : output buffer, always NUL-terminated on return
 *
 * Fixes relative to the previous version:
 *   - the terminator is written unconditionally (a dangling `if` used to skip
 *     it for an empty CIGAR, leaving `matrix` unterminated);
 *   - the MD cursor can no longer move past the end of the expanded MD in the
 *     'D' branch;
 *   - string lengths are computed once.
 *
 * NOTE(review): the two scratch buffers hold 200 bytes and the expansion
 * helpers take no capacity argument, so inputs expanding beyond that still
 * overflow -- confirm the maximum read length.
 */
void convertMDCigarToMatrix(char *cigar, int cigar_size, char *md, int md_size, char *matrix)
{
	char tmp1[200];     /* expanded CIGAR */
	char tmp2[200];     /* expanded MD */
	size_t len1;
	size_t len2;
	size_t i;
	size_t j = 0;
	int size = 0;

	convertMDToMatrix(md, md_size, tmp2);
	convertCigarToMatrix(cigar, cigar_size, tmp1);

	len1 = strlen(tmp1);
	len2 = strlen(tmp2);

	for (i = 0; i < len1; i++)
	{
		if (tmp1[i] == 'M')
		{
			matrix[size] = (j < len2) ? tmp2[j] : 'M';
			size++;
		}
		else if (tmp1[i] == 'D')
		{
			matrix[size] = 'D';
			size++;
			if (j < len2)
				j++;
			/* j <= len2 here, so this reads at most the terminator */
			matrix[size] = tmp2[j];
			size++;
		}
		else if (tmp1[i] == 'I')
		{
			matrix[size] = 'I';
			size++;
		}

		if (j < len2)
			j++;
	}

	matrix[size] = '\0';
}
7278
/*
 * Rewrite an edit string so that insertions and deletions carry a base taken
 * from the read.
 *
 * in_matrix  : per-position edit string
 * seq        : read bases
 * out_matrix : result, NUL-terminated
 *
 * Per input symbol:
 *   'D'   -> 'D' plus seq[cursor+1]; the following input symbol is skipped
 *            and the read cursor moves two positions
 *   'I'   -> 'I' plus seq[cursor]; the read cursor moves one position
 *   other -> copied unchanged ('M' included); the read cursor moves one position
 */
void convertInsertion(char * in_matrix, char * seq, char *out_matrix)
{
	size_t inPos;
	int seqPos = 0;
	char *dst = out_matrix;

	/* the length is re-read on every pass, exactly as before */
	for (inPos = 0; inPos < strlen(in_matrix); inPos++)
	{
		switch (in_matrix[inPos])
		{
		case 'D':
			*dst++ = 'D';
			inPos++;
			seqPos++;
			*dst++ = seq[seqPos];
			seqPos++;
			break;

		case 'I':
			*dst++ = 'I';
			*dst++ = seq[seqPos];
			seqPos++;
			break;

		default:
			*dst++ = in_matrix[inPos];
			seqPos++;
			break;
		}
	}

	*dst = '\0';
}
7323
7324 /**********************************************/
7325 void outputPairedEndDiscPP()
7326 {
7327 char tmp_matrix1[200];
7328 char tmp_matrix2[200];
7329
7330 char matrix1[200];
7331 char matrix2[200];
7332
7333 char cigar1[200];
7334 char editString1[200];
7335
7336 char cigar2[200];
7337 char editString2[200];
7338
7339 char seq1[SEQ_LENGTH+1];
7340 char qual1[SEQ_LENGTH+1];
7341
7342 char seq2[SEQ_LENGTH+1];
7343 char qual2[SEQ_LENGTH+1];
7344
7345 char genName[SEQ_LENGTH];
7346 char fname1[FILE_NAME_LENGTH];
7347 char fname2[FILE_NAME_LENGTH];
7348 char l;
7349 int l_size;
7350 int loc1, loc2;
7351 int err1, err2;
7352 char dir1, dir2;
7353 float sc1, sc2, lsc=0;
7354 int flag = 0;
7355 int rNo,lrNo = -1;
7356 int tmp;
7357 FILE *in, *out;
7358
7359 sprintf(fname1, "%s__%s__disc", mappingOutputPath, mappingOutput);
7360 sprintf(fname2, "%s%s_DIVET.vh", mappingOutputPath, mappingOutput);
7361
7362 in = fileOpen(fname1, "r");
7363 out = fileOpen(fname2, "w");
7364
7365 if (in != NULL)
7366 {
7367 flag = fread(&rNo, sizeof(int), 1, in);
7368 }
7369 else
7370 {
7371 flag = 0;
7372 }
7373
7374 seq1[SEQ_LENGTH] = '\0';
7375 qual1[SEQ_LENGTH] = '\0';
7376
7377 seq2[SEQ_LENGTH] = '\0';
7378 qual2[SEQ_LENGTH] = '\0';
7379
7380 while (flag)
7381 {
7382 tmp = fread(&l, sizeof(char), 1, in);
7383 tmp = fread(genName, sizeof(char), l, in);
7384 genName[(int)l]='\0';
7385 tmp = fread(&loc1, sizeof(int), 1, in);
7386 tmp = fread(&err1, sizeof(int), 1, in);
7387 tmp = fread(&sc1, sizeof(float), 1, in);
7388
7389 //tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out);
7390
7391 tmp = fread(&l_size, sizeof(int), 1, in);
7392 tmp = fread(cigar1, sizeof(char), l_size, in);
7393 cigar1[(int)l_size]='\0';
7394 //tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out);
7395
7396 //tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out);
7397 tmp = fread(&l_size, sizeof(int), 1, in);
7398 tmp = fread(editString1, sizeof(char), l_size, in);
7399 editString1[(int)l_size]='\0';
7400 //tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out);
7401
7402 tmp = fread(&loc2, sizeof(int), 1, in);
7403 tmp = fread(&err2, sizeof(int), 1, in);
7404 tmp = fread(&sc2, sizeof(float), 1, in);
7405
7406 tmp = fread(&l_size, sizeof(int), 1, in);
7407 tmp = fread(cigar2, sizeof(char), l_size, in);
7408 cigar2[(int)l_size]='\0';
7409
7410 tmp = fread(&l_size, sizeof(int), 1, in);
7411 tmp = fread(editString2, sizeof(char), l_size, in);
7412 editString2[(int)l_size]='\0';
7413
7414 convertMDCigarToMatrix(cigar1, strlen(cigar1), editString1, strlen(editString1), tmp_matrix1);
7415 convertMDCigarToMatrix(cigar2, strlen(cigar2), editString2, strlen(editString2), tmp_matrix2);
7416
7417
7418 if(_msf_readHasConcordantMapping[rNo] == 0)
7419 {
7420
7421 dir1 = dir2 = 'F';
7422
7423 strncpy(seq1, _msf_seqList[rNo*2].seq, SEQ_LENGTH);
7424 strncpy(seq2, _msf_seqList[rNo*2+1].seq, SEQ_LENGTH);
7425
7426 if (loc1 < 0)
7427 {
7428 dir1 = 'R';
7429 loc1 = -loc1;
7430
7431 strncpy(seq1, _msf_seqList[rNo*2].rseq, SEQ_LENGTH);
7432 }
7433
7434 if (loc2 < 0)
7435 {
7436 dir2 = 'R';
7437 loc2 = -loc2;
7438
7439 strncpy(seq2, _msf_seqList[rNo*2+1].rseq, SEQ_LENGTH);
7440 }
7441
7442 convertInsertion(tmp_matrix1, seq1, matrix1);
7443 convertInsertion(tmp_matrix2, seq2, matrix2);
7444
7445
7446 if (rNo != lrNo)
7447 {
7448 int j;
7449 for (j=0; j<SEQ_LENGTH; j++)
7450 {
7451 lsc += _msf_seqList[rNo*2].qual[j]+_msf_seqList[rNo*2+1].qual[j];
7452 }
7453 lsc /= 2*SEQ_LENGTH;
7454 lsc -= 33;
7455 lrNo = rNo;
7456 }
7457
7458 char event = '\0';
7459
7460
7461 if ( dir1 == dir2 )
7462 {
7463 event = 'V';
7464 }
7465 else
7466 {
7467 if (loc1 < loc2)
7468 {
7469
7470 if (dir1 == 'R' && dir2 == 'F')
7471 {
7472 event = 'E';
7473
7474 }
7475 else if ( loc2 - loc1 >= maxPairEndedDiscordantDistance )
7476 {
7477 event = 'D';
7478 }
7479 else
7480 {
7481 event = 'I';
7482 }
7483 }
7484 else if (loc2 < loc1)
7485 {
7486 if (dir2 == 'R' && dir1 == 'F')
7487 {
7488 event = 'E';
7489 }
7490 else if ( loc1 - loc2 >= maxPairEndedDiscordantDistance )
7491 {
7492 event = 'D';
7493 }
7494 else
7495 {
7496 event = 'I';
7497 }
7498 }
7499 }
7500 _msf_seqList[rNo*2].hits[0] = 2;
7501 if(event != 'E')
7502 fprintf(out, "%s\t%s\t%d\t%d\t%c\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%e\n",
7503 _msf_seqList[rNo*2].name, genName, loc1, (loc1+SEQ_LENGTH-1), dir1,
7504 loc2, (loc2+SEQ_LENGTH-1), dir2, event, (err1+err2), lsc, sc1*sc2);
7505
7506 }
7507 flag = fread(&rNo, sizeof(int), 1, in);
7508 }
7509
7510 fclose(in);
7511 fclose(out);
7512
7513 unlink(fname1);
7514 }
7515
7516 void finalizeOEAReads(char *fileName)
7517 {
7518 FILE *fp_out1;
7519 FILE * in;
7520
7521 char genName[SEQ_LENGTH];
7522
7523 char fname1[FILE_NAME_LENGTH];
7524 char fname2[FILE_NAME_LENGTH];
7525
7526 char l=0;
7527 int loc1=0;
7528
7529 int err1;
7530
7531 char d;
7532
7533 float sc1=0;
7534 int flag = 0;
7535 int rNo=-1;
7536 int tmp=0;
7537
7538 int cigarSize = 0;
7539 int mdSize = 0;
7540
7541 char cigar[SEQ_LENGTH+1];
7542 char md[SEQ_LENGTH+1];
7543
7544 char *seq1, *seq2, *qual1, *qual2;
7545 char *rqual1, *rqual2;
7546
7547 seq1=NULL; seq2=NULL; qual1=NULL; qual2=NULL;
7548
7549 rqual1 = getMem(200*sizeof(char));
7550 rqual2 = getMem(200*sizeof(char));
7551
7552 rqual1[0] = '\0';
7553 rqual2[0] = '\0';
7554
7555 sprintf(fname1, "%s%s_OEA", mappingOutputPath, mappingOutput);
7556
7557 fp_out1 = fileOpen(fname1, "w");
7558
7559 in = NULL;
7560 if (pairedEndDiscordantMode){
7561 sprintf(fname2, "%s__%s__oea", mappingOutputPath, mappingOutput);
7562
7563 in = fileOpen(fname2, "r");
7564 }
7565
7566
7567 if (in != NULL)
7568 {
7569 flag = fread(&rNo, sizeof(int), 1, in);
7570 }
7571 else
7572 {
7573 flag = 0;
7574 }
7575
7576 while (flag)
7577 {
7578 cigar[0] = '\0';
7579 md[0] = '\0';
7580
7581 tmp = fread(&l, sizeof(char), 1, in);
7582 tmp = fread(genName, sizeof(char), l, in);
7583
7584 genName[(int)l]='\0';
7585
7586
7587 tmp = fread(&loc1, sizeof(int), 1, in);
7588 tmp = fread(&err1, sizeof(int), 1, in);
7589 tmp = fread(&sc1, sizeof(float), 1, in);
7590
7591 tmp = fread (&cigarSize, sizeof(int), 1, in);
7592 tmp = fread (cigar, sizeof(char), cigarSize, in);
7593
7594 cigar[cigarSize] = '\0';
7595
7596 tmp = fread (&mdSize, sizeof(int), 1, in);
7597 tmp = fread (md, sizeof(char), mdSize, in);
7598 md[mdSize] = '\0';
7599
7600 d = 1;
7601
7602 if(loc1 < 0)
7603 {
7604 d = -1;
7605 loc1 *= -1;
7606
7607 seq1 = _msf_seqList[rNo].rseq;
7608 reverse(_msf_seqList[rNo].qual, rqual1, SEQ_LENGTH);
7609 rqual1[SEQ_LENGTH] = '\0';
7610 }
7611 else
7612 {
7613 seq1 = _msf_seqList[rNo].seq;
7614 qual1 = _msf_seqList[rNo].qual;
7615 }
7616
7617 if(rNo % 2 == 0)
7618 {
7619 seq2 = _msf_seqList[rNo+1].seq;
7620 qual2 = _msf_seqList[rNo+1].qual;
7621 }
7622 else
7623 {
7624 seq2 = _msf_seqList[rNo-1].seq;
7625 qual2 = _msf_seqList[rNo-1].qual;
7626 }
7627
7628 if(_msf_seqHits[rNo] != 0 && _msf_seqHits[(rNo%2==0)?rNo+1:rNo-1] == 0)
7629 {
7630 _msf_output.POS = loc1;
7631 _msf_output.MPOS = 0;
7632 _msf_output.FLAG = (rNo % 2 ==0)? 1+4+32*d+128 : 1+8+16*d+64 ;
7633 _msf_output.ISIZE = 0;
7634 _msf_output.SEQ = seq1;
7635 _msf_output.QUAL = qual1;
7636 _msf_output.QNAME = _msf_seqList[rNo].name;
7637 _msf_output.RNAME = genName;
7638 _msf_output.MAPQ = 255;
7639 _msf_output.CIGAR = cigar;
7640 _msf_output.MRNAME = "=";
7641
7642
7643 _msf_output.optSize = 4;
7644 _msf_output.optFields = _msf_optionalFields;
7645
7646 _msf_optionalFields[0].tag = "NM";
7647 _msf_optionalFields[0].type = 'i';
7648 _msf_optionalFields[0].iVal = err1;
7649
7650 _msf_optionalFields[1].tag = "MD";
7651 _msf_optionalFields[1].type = 'Z';
7652 _msf_optionalFields[1].sVal = md;
7653
7654
7655
7656 //for the OEA reads
7657 _msf_optionalFields[2].tag = "NS";
7658 _msf_optionalFields[2].type = 'Z';
7659 _msf_optionalFields[2].sVal = seq2;
7660
7661
7662 _msf_optionalFields[3].tag = "NQ";
7663 _msf_optionalFields[3].type = 'Z';
7664 _msf_optionalFields[3].sVal = qual2;
7665
7666 outputSAM(fp_out1, _msf_output);
7667
7668 _msf_seqList[rNo].hits[0] = -1;
7669 _msf_seqList[(rNo%2==0)?rNo+1:rNo-1].hits[0] = -1;
7670 }
7671 flag = fread(&rNo, sizeof(int), 1, in);
7672 }
7673
7674 freeMem(rqual1, 0);
7675 freeMem(rqual2, 0);
7676
7677 unlink(fname2);
7678
7679 fclose(fp_out1);
7680 }
7681
7682 /*
7683
7684 void outputOEA(char *fileName1, FILE * fp_out, int readSegment)
7685 {
7686 int i = 0;
7687 int j = 0;
7688
7689 char *index;
7690
7691 int size1 = 0;
7692
7693 FILE *fp1;
7694
7695 char geneFileName1[FILE_NAME_LENGTH];
7696
7697 char matrix[200];
7698 char cigar[MAX_CIGAR_SIZE];
7699 char editString[200];
7700
7701 FullMappingInfoLink *miL = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink));
7702
7703 if(fileName1 != NULL)
7704 {
7705
7706 fp1 = fileOpen(fileName1, "r");
7707
7708 index = strstr(fileName1, "__");
7709 strncpy(geneFileName1, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2);
7710 geneFileName1[strstr(index + 2, "__") - index - 2] = '\0';
7711
7712 for(i = 0; i < _msf_seqListSize / 2; i++)
7713 {
7714 fread(&size1, sizeof(int), 1, fp1);
7715
7716 miL[i].mi = getMem(size1 * sizeof(FullMappingInfo) );
7717
7718 miL[i].size = size1;
7719
7720 for(j = 0; j < size1; j++)
7721 {
7722 fread(&(miL[i].mi[j].loc), sizeof(int), 1, fp1);
7723
7724 fread (&(miL[i].mi[j].err), sizeof(int), 1, fp1);
7725
7726 fread (&(miL[i].mi[j].cigarSize), sizeof(int), 1, fp1);
7727 fread ((miL[i].mi[j].cigar), sizeof(char), miL[i].mi[j].cigarSize+1, fp1);
7728
7729 fread (&(miL[i].mi[j].mdSize), sizeof(int), 1, fp1);
7730 fread ((miL[i].mi[j].md), sizeof(char), miL[i].mi[j].mdSize+1, fp1);
7731
7732 miL[i].mi[j].dir = 1;
7733 if(miL[i].mi[j].loc < 1)
7734 {
7735 miL[i].mi[j].loc *= -1;
7736 miL[i].mi[j].dir = -1;
7737 }
7738 }
7739
7740 int tmpSize = (readSegment==0) ? _msf_seqHits[i*2+1] : _msf_seqHits[i*2];
7741
7742 if(_msf_seqHits[i*2+readSegment] == 0 && size1 != 0 && _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)] <= maxOEAOutput)
7743 {
7744 int d1 = 0;
7745
7746 char *seq, *qual;
7747 char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2;
7748 char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1];
7749
7750 rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0';
7751 seq1 = _msf_seqList[i*2].seq;
7752 rseq1 = _msf_seqList[i*2].rseq;
7753 qual1 = _msf_seqList[i*2].qual;
7754 reverse(_msf_seqList[i*2].qual, rqual1, SEQ_LENGTH);
7755
7756 seq2 = _msf_seqList[i*2+1].seq;
7757 rseq2 = _msf_seqList[i*2+1].rseq;
7758 qual2 = _msf_seqList[i*2+1].qual;
7759 reverse(_msf_seqList[i*2+1].qual, rqual2, SEQ_LENGTH);
7760
7761 for(j = 0; j < size1 && _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)] <= maxOEAOutput; j++)
7762 {
7763 d1 = (miL[i].mi[j].dir == -1)?1:0;
7764
7765 if(readSegment == 0)
7766 {
7767 if ( d1 )
7768 {
7769 seq = rseq2;
7770 qual = rqual2;
7771 }
7772 else
7773 {
7774 seq = seq2;
7775 qual = qual2;
7776 }
7777 }
7778 else
7779 {
7780 if ( d1 )
7781 {
7782 seq = rseq1;
7783 qual = rqual1;
7784 }
7785 else
7786 {
7787 seq = seq1;
7788 qual = qual1;
7789 }
7790 }
7791
7792 _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)]++;
7793
7794 _msf_output.POS = (readSegment==1)?miL[i].mi[j].loc:0;
7795 _msf_output.MPOS = (readSegment==0)?miL[i].mi[j].loc:0;
7796 _msf_output.FLAG = (readSegment==0)? 1+4+32*d1+128 : 1+8+16*d1+64 ;
7797 _msf_output.ISIZE = 0;
7798 _msf_output.SEQ = seq,
7799 _msf_output.QUAL = qual;
7800 _msf_output.QNAME = _msf_seqList[i*2+(readSegment==0?1:0)].name;
7801 _msf_output.RNAME = geneFileName1;
7802 _msf_output.MAPQ = 255;
7803 _msf_output.CIGAR = miL[i].mi[j].cigar;
7804 _msf_output.MRNAME = "=";
7805 //_msf_output.NSEQ = (readSegment == 0)?seq1:seq2;
7806 //_msf_output.NQUAL = (readSegment == 0)?qual1:qual2;
7807
7808
7809 _msf_output.optSize = 4;
7810 _msf_output.optFields = _msf_optionalFields;
7811
7812 _msf_optionalFields[0].tag = "NM";
7813 _msf_optionalFields[0].type = 'i';
7814 _msf_optionalFields[0].iVal = miL[i].mi[j].err;
7815
7816 _msf_optionalFields[1].tag = "MD";
7817 _msf_optionalFields[1].type = 'Z';
7818 _msf_optionalFields[1].sVal = miL[i].mi[j].md;
7819
7820
7821
7822 //for the OEA reads
7823 _msf_optionalFields[2].tag = "NS";
7824 _msf_optionalFields[2].type = 'Z';
7825 _msf_optionalFields[2].sVal = (readSegment == 0)?seq1:seq2;
7826
7827
7828 _msf_optionalFields[3].tag = "NQ";
7829 _msf_optionalFields[3].type = 'Z';
7830 _msf_optionalFields[3].sVal = (readSegment == 0)?qual1:qual2;
7831
7832 outputSAM(fp_out, _msf_output);
7833
7834 }
7835 }
7836 }
7837
7838 }
7839
7840 for(i = 0; i < _msf_seqListSize / 2; i++)
7841 {
7842 freeMem(miL[i].mi, miL[i].size * sizeof(FullMappingInfo));
7843 }
7844
7845 freeMem(miL, _msf_seqListSize * sizeof(FullMappingInfoLink));
7846
7847 fclose(fp1);
7848 }
7849
7850 void finalizeOEAReads(char *fileName)
7851 {
7852
7853 int i = 0;
7854 int k = 0;
7855
7856 FILE *fp_out1;
7857 char fname1[200];
7858
7859 _msf_oeaMapping = getMem(_msf_seqListSize * sizeof(int));
7860 for(i = 0; i < _msf_seqListSize; i++)
7861 {
7862 _msf_oeaMapping[i] = 0;
7863 }
7864
7865 sprintf(fname1, "%s%s_OEA", mappingOutputPath, mappingOutput);
7866
7867 fp_out1 = fileOpen(fname1, "w");
7868 for(i = 0; i < _msf_maxFile; i++)
7869 {
7870 for(k = 0; k < _msf_fileCount[i]; k++)
7871 {
7872 outputOEA(_msf_fileName[i][k][1], fp_out1, 0);
7873 }// for k
7874 } //for i
7875
7876
7877 for(i = 0; i < _msf_maxFile; i++)
7878 {
7879 for(k = 0; k < _msf_fileCount[i]; k++)
7880 {
7881 outputOEA(_msf_fileName[i][k][0], fp_out1, 1);
7882 }// for k
7883 } //for i
7884
7885 fclose(fp_out1);
7886
7887 }
7888 */
7889
7890 void outputTransChromosal(char *fileName1, char *fileName2, FILE * fp_out)
7891 {
7892 int i = 0;
7893 int j = 0;
7894 int k = 0;
7895
7896 char *index;
7897
7898 int size1 = 0;
7899 int size2 = 0;
7900
7901 FILE *fp1 = NULL;
7902 FILE *fp2 = NULL;
7903
7904 char geneFileName1[FILE_NAME_LENGTH];
7905 char geneFileName2[FILE_NAME_LENGTH];
7906
7907 FullMappingInfoLink *miL = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink));
7908 FullMappingInfoLink *miR = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink));
7909
7910
7911 if(fileName1 != NULL && fileName2 != NULL)
7912 {
7913
7914 fp1 = fileOpen(fileName1, "r");
7915 fp2 = fileOpen(fileName2, "r");
7916
7917 index = strstr(fileName1, "__");
7918 strncpy(geneFileName1, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2);
7919 geneFileName1[strstr(index + 2, "__") - index - 2] = '\0';
7920
7921 index = strstr(fileName2, "__");
7922 strncpy(geneFileName2, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2);
7923 geneFileName2[strstr(index + 2, "__") - index - 2] = '\0';
7924
7925
7926 for(i = 0; i < _msf_seqListSize / 2; i++)
7927 {
7928 fread(&size1, sizeof(int), 1, fp1);
7929 fread(&size2, sizeof(int), 1, fp2);
7930
7931 miL[i].mi = getMem(size1 * sizeof(FullMappingInfo) );
7932 miR[i].mi = getMem(size2 * sizeof(FullMappingInfo) );
7933
7934 miL[i].size = size1;
7935 miR[i].size = size2;
7936
7937 for(j = 0; j < size1; j++)
7938 {
7939 fread(&(miL[i].mi[j].loc), sizeof(int), 1, fp1);
7940
7941 fread (&(miL[i].mi[j].err), sizeof(int), 1, fp1);
7942
7943 fread (&(miL[i].mi[j].cigarSize), sizeof(int), 1, fp1);
7944 fread ((miL[i].mi[j].cigar), sizeof(char), miL[i].mi[j].cigarSize+1, fp1);
7945
7946 fread (&(miL[i].mi[j].mdSize), sizeof(int), 1, fp1);
7947 fread ((miL[i].mi[j].md), sizeof(char), miL[i].mi[j].mdSize+1, fp1);
7948
7949 miL[i].mi[j].dir = 1;
7950 if(miL[i].mi[j].loc < 1)
7951 {
7952 miL[i].mi[j].loc *= -1;
7953 miL[i].mi[j].dir = -1;
7954 }
7955 }
7956 for(k = 0; k < size2; k++)
7957 {
7958 fread(&(miR[i].mi[k].loc), sizeof(int), 1, fp2);
7959
7960 fread (&(miR[i].mi[k].err), sizeof(int), 1, fp2);
7961
7962 fread (&(miR[i].mi[k].cigarSize), sizeof(int), 1, fp2);
7963 fread ((miR[i].mi[k].cigar), sizeof(char), miR[i].mi[k].cigarSize+1, fp2);
7964
7965 fread (&(miR[i].mi[k].mdSize), sizeof(int), 1, fp2);
7966 fread ((miR[i].mi[k].md), sizeof(char), miR[i].mi[k].mdSize+1, fp2);
7967
7968 miR[i].mi[k].dir = 1;
7969 if(miR[i].mi[k].loc < 1)
7970 {
7971 miR[i].mi[k].loc *= -1;
7972 miR[i].mi[k].dir = -1;
7973 }
7974 }
7975 if(_msf_readHasConcordantMapping[i] == 0 && size1 != 0 && size2 != 0 && (size1 * size2 < MAX_TRANS_CHROMOSAL_OUTPUT))
7976 {
7977 int d1 = 0;
7978 int d2 = 0;
7979 char *seq, *qual;
7980 char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2;
7981 char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1];
7982 rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0';
7983 seq1 = _msf_seqList[i*2].seq;
7984 rseq1 = _msf_seqList[i*2].rseq;
7985 qual1 = _msf_seqList[i*2].qual;
7986 reverse(_msf_seqList[i*2].qual, rqual1, SEQ_LENGTH);
7987
7988 seq2 = _msf_seqList[i*2+1].seq;
7989 rseq2 = _msf_seqList[i*2+1].rseq;
7990 qual2 = _msf_seqList[i*2+1].qual;
7991 reverse(_msf_seqList[i*2+1].qual, rqual2, SEQ_LENGTH);
7992
7993 for(j = 0; j < size1; j++)
7994 {
7995 d1 = (miL[i].mi[j].dir == -1)?1:0;
7996
7997 if ( d1 )
7998 {
7999 seq = rseq1;
8000 qual = rqual1;
8001 }
8002 else
8003 {
8004 seq = seq1;
8005 qual = qual1;
8006 }
8007
8008 for(k = 0; k < size2; k++)
8009 {
8010
8011 d2 = (miR[i].mi[k].dir == -1)?1:0;
8012
8013 _msf_output.POS = miL[i].mi[j].loc;
8014 _msf_output.MPOS = miR[i].mi[k].loc;
8015 _msf_output.FLAG = 0;
8016 _msf_output.ISIZE = 0;
8017 _msf_output.SEQ = seq,
8018 _msf_output.QUAL = qual;
8019 _msf_output.QNAME = _msf_seqList[i*2].name;
8020 _msf_output.RNAME = geneFileName1;
8021 _msf_output.MAPQ = 255;
8022 _msf_output.CIGAR = miL[i].mi[j].cigar;
8023 _msf_output.MRNAME = "=";
8024
8025 _msf_output.optSize = 2;
8026 _msf_output.optFields = _msf_optionalFields;
8027
8028 _msf_optionalFields[0].tag = "NM";
8029 _msf_optionalFields[0].type = 'i';
8030 _msf_optionalFields[0].iVal = miL[i].mi[j].err;
8031
8032 _msf_optionalFields[1].tag = "MD";
8033 _msf_optionalFields[1].type = 'Z';
8034 _msf_optionalFields[1].sVal = miL[i].mi[j].md;
8035
8036
8037 if ( d2 )
8038 {
8039 seq = rseq2;
8040 qual = rqual2;
8041 }
8042 else
8043 {
8044 seq = seq2;
8045 qual = qual2;
8046 }
8047
8048 outputSAM(fp_out, _msf_output);
8049
8050
8051 _msf_output.POS = miR[i].mi[k].loc;
8052 _msf_output.MPOS = miL[i].mi[j].loc;
8053 _msf_output.FLAG = 0;
8054 _msf_output.ISIZE = 0;
8055 _msf_output.SEQ = seq,
8056 _msf_output.QUAL = qual;
8057 _msf_output.QNAME = _msf_seqList[i*2+1].name;
8058 _msf_output.RNAME = geneFileName2;
8059 _msf_output.MAPQ = 255;
8060 _msf_output.CIGAR = miR[i].mi[k].cigar;
8061 _msf_output.MRNAME = "=";
8062
8063 _msf_output.optSize = 2;
8064 _msf_output.optFields = _msf_optionalFields;
8065
8066 _msf_optionalFields[0].tag = "NM";
8067 _msf_optionalFields[0].type = 'i';
8068 _msf_optionalFields[0].iVal = miR[i].mi[k].err;
8069
8070 _msf_optionalFields[1].tag = "MD";
8071 _msf_optionalFields[1].type = 'Z';
8072 _msf_optionalFields[1].sVal = miR[i].mi[k].md;
8073
8074 outputSAM(fp_out, _msf_output);
8075
8076 }
8077 }
8078 }
8079 }
8080
8081 }
8082
8083 for(i = 0; i < _msf_seqListSize / 2; i++)
8084 {
8085 freeMem(miL[i].mi, miL[i].size * sizeof(FullMappingInfo));
8086 freeMem(miR[i].mi, miR[i].size * sizeof(FullMappingInfo));
8087 }
8088
8089 freeMem(miL, _msf_seqListSize * sizeof(FullMappingInfoLink));
8090 freeMem(miR, _msf_seqListSize * sizeof(FullMappingInfoLink));
8091
8092 fclose(fp1);
8093 fclose(fp2);
8094 }
8095
8096 /*
8097 if flag is 1 it will output all the possible trans chromsal mapping
8098 otherwise only tmp file will be delete
8099
8100 */
8101
8102 void outputAllTransChromosal(int flag)
8103 {
8104
8105 int i = 0;
8106 int j = 0;
8107 int k = 0;
8108 int l = 0;
8109
8110 FILE *fp_out = NULL;
8111 char fname1[200];
8112
8113 if(flag)
8114 {
8115 fp_out = fileOpen(fname1, "w");
8116
8117 sprintf(fname1, "%s%s_TRANSCHROMOSOMAL", mappingOutputPath, mappingOutput);
8118
8119 // for(i = 0; i < _msf_maxFile; i++)
8120 // {
8121 i = 0;
8122 for(j = i+1; j < _msf_maxFile; j++)
8123 {
8124 if(i != j)
8125 {
8126 for(k = 0; k < _msf_fileCount[i]; k++)
8127 {
8128 for(l = 0; l < _msf_fileCount[j]; l++)
8129 {
8130 outputTransChromosal(_msf_fileName[i][k][0], _msf_fileName[j][l][1], fp_out);
8131 }// for l
8132 }// for k
8133 }// if
8134 }// for j
8135 // } //for i
8136 }
8137
8138 for(i = 0; i < _msf_maxFile; i++)
8139 {
8140 for(j = 0; j < _msf_fileCount[i]; j++)
8141 {
8142 unlink(_msf_fileName[i][j][0]);
8143 unlink(_msf_fileName[i][j][1]);
8144 }
8145 }
8146 if(flag)
8147 fclose(fp_out);
8148 }