| 0 | 1 #include <string.h> | 
|  | 2 #include <assert.h> | 
|  | 3 #include <unistd.h> | 
|  | 4 #include "kstring.h" | 
|  | 5 #include "sam_header.h" | 
|  | 6 #include "sam.h" | 
|  | 7 #include "bam.h" | 
|  | 8 #include "faidx.h" | 
|  | 9 | 
|  | 10 bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/ | 
|  | 11 | 
|  | 12 static void replace_cigar(bam1_t *b, int n, uint32_t *cigar) | 
|  | 13 { | 
|  | 14 	if (n != b->core.n_cigar) { | 
|  | 15 		int o = b->core.l_qname + b->core.n_cigar * 4; | 
|  | 16 		if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) { | 
|  | 17 			b->m_data = b->data_len + (n - b->core.n_cigar) * 4; | 
|  | 18 			kroundup32(b->m_data); | 
|  | 19 			b->data = (uint8_t*)realloc(b->data, b->m_data); | 
|  | 20 		} | 
|  | 21 		memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->data_len - o); | 
|  | 22 		memcpy(b->data + b->core.l_qname, cigar, n * 4); | 
|  | 23 		b->data_len += (n - b->core.n_cigar) * 4; | 
|  | 24 		b->core.n_cigar = n; | 
|  | 25 	} else memcpy(b->data + b->core.l_qname, cigar, n * 4); | 
|  | 26 } | 
|  | 27 | 
|  | 28 #define write_cigar(_c, _n, _m, _v) do { \ | 
|  | 29 		if (_n == _m) { \ | 
|  | 30 			_m = _m? _m<<1 : 4; \ | 
|  | 31 			_c = (uint32_t*)realloc(_c, _m * 4); \ | 
|  | 32 		} \ | 
|  | 33 		_c[_n++] = (_v); \ | 
|  | 34 	} while (0) | 
|  | 35 | 
|  | 36 static void unpad_seq(bam1_t *b, kstring_t *s) | 
|  | 37 { | 
|  | 38 	int k, j, i; | 
|  | 39 	int length; | 
|  | 40 	uint32_t *cigar = bam1_cigar(b); | 
|  | 41 	uint8_t *seq = bam1_seq(b); | 
|  | 42 	// b->core.l_qseq gives length of the SEQ entry (including soft clips, S) | 
|  | 43 	// We need the padded length after alignment from the CIGAR (excluding | 
|  | 44 	// soft clips S, but including pads from CIGAR D operations) | 
|  | 45 	length = 0; | 
|  | 46 	for (k = 0; k < b->core.n_cigar; ++k) { | 
|  | 47 		int op, ol; | 
|  | 48 		op= bam_cigar_op(cigar[k]); | 
|  | 49 		ol = bam_cigar_oplen(cigar[k]); | 
|  | 50 		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL) | 
|  | 51 			length += ol; | 
|  | 52 	} | 
|  | 53 	ks_resize(s, length); | 
|  | 54 	for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) { | 
|  | 55 		int op, ol; | 
|  | 56 		op = bam_cigar_op(cigar[k]); | 
|  | 57 		ol = bam_cigar_oplen(cigar[k]); | 
|  | 58 		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { | 
|  | 59 			for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j); | 
|  | 60 		} else if (op == BAM_CSOFT_CLIP) { | 
|  | 61 			j += ol; | 
|  | 62 		} else if (op == BAM_CHARD_CLIP) { | 
|  | 63 			/* do nothing */ | 
|  | 64 		} else if (op == BAM_CDEL) { | 
|  | 65 			for (i = 0; i < ol; ++i) s->s[s->l++] = 0; | 
|  | 66                 } else { | 
|  | 67 			fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b)); | 
|  | 68                         assert(-1); | 
|  | 69 		} | 
|  | 70 	} | 
|  | 71 	assert(length == s->l); | 
|  | 72 } | 
|  | 73 | 
|  | 74 int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) | 
|  | 75 { | 
|  | 76 	char base; | 
|  | 77 	char *fai_ref = 0; | 
|  | 78 	int fai_ref_len = 0, k; | 
|  | 79 | 
|  | 80 	fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); | 
|  | 81 	if (fai_ref_len != ref_len) { | 
|  | 82 		fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); | 
|  | 83 		free(fai_ref); | 
|  | 84 		return -1; | 
|  | 85 	} | 
|  | 86 	ks_resize(seq, ref_len); | 
|  | 87 	seq->l = 0; | 
|  | 88 	for (k = 0; k < ref_len; ++k) { | 
|  | 89 		base = fai_ref[k]; | 
|  | 90 		if (base == '-' || base == '*') { | 
|  | 91 			// Map gaps to null to match unpad_seq function | 
|  | 92 			seq->s[seq->l++] = 0; | 
|  | 93 		} else { | 
|  | 94 			int i = bam_nt16_table[(int)base]; | 
|  | 95 			if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 | 
|  | 96 				fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name); | 
|  | 97 				free(fai_ref); | 
|  | 98 				return -1; | 
|  | 99 			} | 
|  | 100 			seq->s[seq->l++] = i; | 
|  | 101 		} | 
|  | 102 	} | 
|  | 103 	assert(ref_len == seq->l); | 
|  | 104 	free(fai_ref); | 
|  | 105 	return 0; | 
|  | 106 } | 
|  | 107 | 
|  | 108 int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) | 
|  | 109 { | 
|  | 110 	char base; | 
|  | 111 	char *fai_ref = 0; | 
|  | 112 	int fai_ref_len = 0, k; | 
|  | 113 	int bases=0, gaps=0; | 
|  | 114 | 
|  | 115 	fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); | 
|  | 116 	if (fai_ref_len != padded_len) { | 
|  | 117 		fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); | 
|  | 118 		free(fai_ref); | 
|  | 119 		return -1; | 
|  | 120 	} | 
|  | 121 	for (k = 0; k < padded_len; ++k) { | 
|  | 122 		//fprintf(stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref)); | 
|  | 123 		base = fai_ref[k]; | 
|  | 124 		if (base == '-' || base == '*') { | 
|  | 125 			gaps += 1; | 
|  | 126 		} else { | 
|  | 127 			int i = bam_nt16_table[(int)base]; | 
|  | 128 			if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 | 
|  | 129 				fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name); | 
|  | 130 				free(fai_ref); | 
|  | 131 				return -1; | 
|  | 132 			} | 
|  | 133 			bases += 1; | 
|  | 134 		} | 
|  | 135 	} | 
|  | 136 	free(fai_ref); | 
|  | 137 	assert (padded_len == bases + gaps); | 
|  | 138 	return bases; | 
|  | 139 } | 
|  | 140 | 
|  | 141 inline int * update_posmap(int *posmap, kstring_t ref) | 
|  | 142 { | 
|  | 143 	int i, k; | 
|  | 144 	posmap = realloc(posmap, ref.m * sizeof(int)); | 
|  | 145 	for (i = k = 0; i < ref.l; ++i) { | 
|  | 146 		posmap[i] = k; | 
|  | 147 		if (ref.s[i]) ++k; | 
|  | 148 	} | 
|  | 149 	return posmap; | 
|  | 150 } | 
|  | 151 | 
|  | 152 int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai) | 
|  | 153 { | 
|  | 154 	bam_header_t *h = 0; | 
|  | 155 	bam1_t *b = 0; | 
|  | 156 	kstring_t r, q; | 
|  | 157 	int r_tid = -1; | 
|  | 158 	uint32_t *cigar2 = 0; | 
|  | 159 	int ret = 0, n2 = 0, m2 = 0, *posmap = 0; | 
|  | 160 | 
|  | 161 	b = bam_init1(); | 
|  | 162 	r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; | 
|  | 163 	int read_ret; | 
|  | 164 	h = in->header; | 
|  | 165 	while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in' | 
|  | 166 		uint32_t *cigar = bam1_cigar(b); | 
|  | 167 		n2 = 0; | 
|  | 168 		if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) { | 
|  | 169 			// fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam1_qname(b)); | 
|  | 170 			r_tid = b->core.tid; | 
|  | 171 			unpad_seq(b, &r); | 
|  | 172 			if (h->target_len[r_tid] != r.l) { | 
|  | 173 				fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l); | 
|  | 174 				return -1; | 
|  | 175 			} | 
|  | 176 			if (fai) { | 
|  | 177 				// Check the embedded reference matches the FASTA file | 
|  | 178 				if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { | 
|  | 179 					fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); | 
|  | 180 					return -1; | 
|  | 181 				} | 
|  | 182 				assert(r.l == q.l); | 
|  | 183 				int i; | 
|  | 184 				for (i = 0; i < r.l; ++i) { | 
|  | 185 					if (r.s[i] != q.s[i]) { | 
|  | 186 						// Show gaps as ASCII 45 | 
|  | 187 						fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", | 
|  | 188 							h->target_name[b->core.tid], i+1, | 
|  | 189 							r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45, | 
|  | 190 							q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45); | 
|  | 191 						return -1; | 
|  | 192 					} | 
|  | 193 				} | 
|  | 194 			} | 
|  | 195 			write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); | 
|  | 196 			replace_cigar(b, n2, cigar2); | 
|  | 197 			posmap = update_posmap(posmap, r); | 
|  | 198 		} else if (b->core.n_cigar > 0) { | 
|  | 199 			int i, k, op; | 
|  | 200 			if (b->core.tid < 0) { | 
|  | 201 				fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b)); | 
|  | 202 				return -1; | 
|  | 203 			} else if (b->core.tid == r_tid) { | 
|  | 204 				; // good case, reference available | 
|  | 205 				//fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam1_qname(b)); | 
|  | 206 			} else if (fai) { | 
|  | 207 				if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { | 
|  | 208 					fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); | 
|  | 209 					return -1; | 
|  | 210 				} | 
|  | 211 				posmap = update_posmap(posmap, r); | 
|  | 212 				r_tid = b->core.tid; | 
|  | 213 				// fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); | 
|  | 214 			} else { | 
|  | 215 				fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); | 
|  | 216 				return -1; | 
|  | 217 			} | 
|  | 218 			unpad_seq(b, &q); | 
|  | 219 			if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) { | 
|  | 220 				write_cigar(cigar2, n2, m2, cigar[0]); | 
|  | 221 			} else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) { | 
|  | 222 				write_cigar(cigar2, n2, m2, cigar[0]); | 
|  | 223 				if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) { | 
|  | 224 					write_cigar(cigar2, n2, m2, cigar[1]); | 
|  | 225 				} | 
|  | 226 			} | 
|  | 227 			/* Determine CIGAR operator for each base in the aligned read */ | 
|  | 228 			for (i = 0, k = b->core.pos; i < q.l; ++i, ++k) | 
|  | 229 				q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD); | 
|  | 230 			/* Include any pads if starts with an insert */ | 
|  | 231 			if (q.s[0] == BAM_CINS) { | 
|  | 232 				for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k); | 
|  | 233 				if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD)); | 
|  | 234 			} | 
|  | 235 			/* Count consecutive CIGAR operators to turn into a CIGAR string */ | 
|  | 236 			for (i = k = 1, op = q.s[0]; i < q.l; ++i) { | 
|  | 237 				if (op != q.s[i]) { | 
|  | 238 					write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); | 
|  | 239 					op = q.s[i]; k = 1; | 
|  | 240 				} else ++k; | 
|  | 241 			} | 
|  | 242 			write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); | 
|  | 243 			if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) { | 
|  | 244 				write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); | 
|  | 245                         } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) { | 
|  | 246 				if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) { | 
|  | 247 					write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]); | 
|  | 248 			  	} | 
|  | 249 				write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); | 
|  | 250 			} | 
|  | 251 			/* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */ | 
|  | 252 			int pre_op, post_op; | 
|  | 253 			for (i = 2; i < n2; ++i) | 
|  | 254 				if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) { | 
|  | 255 					pre_op = bam_cigar_op(cigar2[i-2]); | 
|  | 256 					post_op = bam_cigar_op(cigar2[i]); | 
|  | 257 					/* Note don't need to check for X/= as code above will use M only */ | 
|  | 258 					if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) { | 
|  | 259 						/* This is a redundant P operator */ | 
|  | 260 						cigar2[i-1] = 0; // i.e. 0M | 
|  | 261 						/* If had same operator either side, combine them in post_op */ | 
|  | 262 						if (pre_op == post_op) { | 
|  | 263 							/* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/ | 
|  | 264 							cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op); | 
|  | 265 							cigar2[i-2] = 0; // i.e. 0M | 
|  | 266 						} | 
|  | 267 					} | 
|  | 268 				} | 
|  | 269 			/* Remove the zero'd operators (0M) */ | 
|  | 270 			for (i = k = 0; i < n2; ++i) | 
|  | 271 				if (cigar2[i]) cigar2[k++] = cigar2[i]; | 
|  | 272 			n2 = k; | 
|  | 273 			replace_cigar(b, n2, cigar2); | 
|  | 274 			b->core.pos = posmap[b->core.pos]; | 
|  | 275 			if (b->core.mtid < 0 || b->core.mpos < 0) { | 
|  | 276 				/* Nice case, no mate to worry about*/ | 
|  | 277 				// fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam1_qname(b)); | 
|  | 278 				/* TODO - Warning if FLAG says mate should be mapped? */ | 
|  | 279 				/* Clean up funny input where mate position is given but mate reference is missing: */ | 
|  | 280 				b->core.mtid = -1; | 
|  | 281 				b->core.mpos = -1; | 
|  | 282 			} else if (b->core.mtid == b->core.tid) { | 
|  | 283 				/* Nice case, same reference */ | 
|  | 284 				// fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam1_qname(b)); | 
|  | 285 				b->core.mpos = posmap[b->core.mpos]; | 
|  | 286 			} else { | 
|  | 287 				/* Nasty case, Must load alternative posmap */ | 
|  | 288 				// fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); | 
|  | 289 				if (!fai) { | 
|  | 290 					fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); | 
|  | 291 					return -1; | 
|  | 292 				} | 
|  | 293 				/* Temporarily load the other reference sequence */ | 
|  | 294 				if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { | 
|  | 295 					fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); | 
|  | 296 					return -1; | 
|  | 297 				} | 
|  | 298 				posmap = update_posmap(posmap, r); | 
|  | 299 				b->core.mpos = posmap[b->core.mpos]; | 
|  | 300 				/* Restore the reference and posmap*/ | 
|  | 301 				if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { | 
|  | 302 					fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); | 
|  | 303 					return -1; | 
|  | 304 				} | 
|  | 305 				posmap = update_posmap(posmap, r); | 
|  | 306 			} | 
|  | 307 		} | 
|  | 308 		samwrite(out, b); | 
|  | 309 	} | 
|  | 310 	if (read_ret < -1) { | 
|  | 311 		fprintf(stderr, "[depad] truncated file.\n"); | 
|  | 312 		ret = 1; | 
|  | 313 	} | 
|  | 314 	free(r.s); free(q.s); free(posmap); | 
|  | 315 	bam_destroy1(b); | 
|  | 316 	return ret; | 
|  | 317 } | 
|  | 318 | 
|  | 319 bam_header_t * fix_header(bam_header_t *old, faidx_t *fai) | 
|  | 320 { | 
|  | 321 	int i = 0, unpadded_len = 0; | 
|  | 322 	bam_header_t *header = 0 ; | 
|  | 323 | 
|  | 324 	header = bam_header_dup(old); | 
|  | 325 	for (i = 0; i < old->n_targets; ++i) { | 
|  | 326 		unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); | 
|  | 327 		if (unpadded_len < 0) { | 
|  | 328 			fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); | 
|  | 329 		} else { | 
|  | 330 			header->target_len[i] = unpadded_len; | 
|  | 331 			//fprintf(stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); | 
|  | 332 		} | 
|  | 333 	} | 
|  | 334 	/* Duplicating the header allocated new buffer for header string */ | 
|  | 335 	/* After modifying the @SQ lines it will only get smaller, since */ | 
|  | 336 	/* the LN entries will be the same or shorter, and we'll remove */ | 
|  | 337 	/* any MD entries (MD5 checksums). */ | 
|  | 338 	assert(strlen(old->text) == strlen(header->text)); | 
|  | 339 	assert (0==strcmp(old->text, header->text)); | 
|  | 340 	const char *text; | 
|  | 341 	text = old->text; | 
|  | 342 	header->text[0] = '\0'; /* Resuse the allocated buffer */ | 
|  | 343 	char * newtext = header->text; | 
|  | 344 	char * end=NULL; | 
|  | 345 	while (text[0]=='@') { | 
|  | 346 		end = strchr(text, '\n'); | 
|  | 347 		assert(end != 0); | 
|  | 348 		if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') { | 
|  | 349 			/* TODO - edit the @SQ line here to remove MD and fix LN. */ | 
|  | 350 			/* For now just remove the @SQ line, and samtools will */ | 
|  | 351 			/* automatically generate a minimal replacement with LN. */ | 
|  | 352 			/* However, that discards any other tags like AS, SP, UR. */ | 
|  | 353 			//fprintf(stderr, "[depad] Removing @SQ line\n"); | 
|  | 354 		} else { | 
|  | 355 			/* Copy this line to the new header */ | 
|  | 356 			strncat(newtext, text, end - text + 1); | 
|  | 357 		} | 
|  | 358 		text = end + 1; | 
|  | 359 	} | 
|  | 360 	assert (text[0]=='\0'); | 
|  | 361 	/* Check we didn't overflow the buffer */ | 
|  | 362 	assert (strlen(header->text) <= strlen(old->text)); | 
|  | 363 	if (strlen(header->text) < header->l_text) { | 
|  | 364 		//fprintf(stderr, "[depad] Reallocating header buffer\n"); | 
|  | 365 		assert (newtext == header->text); | 
|  | 366 		newtext = malloc(strlen(header->text) + 1); | 
|  | 367 		strcpy(newtext, header->text); | 
|  | 368 		free(header->text); | 
|  | 369 		header->text = newtext; | 
|  | 370 		header->l_text = strlen(newtext); | 
|  | 371 	} | 
|  | 372 	//fprintf(stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); | 
|  | 373 	return header; | 
|  | 374 } | 
|  | 375 | 
|  | 376 static int usage(int is_long_help); | 
|  | 377 | 
|  | 378 int main_pad2unpad(int argc, char *argv[]) | 
|  | 379 { | 
|  | 380 	samfile_t *in = 0, *out = 0; | 
|  | 381         bam_header_t *h = 0; | 
|  | 382 	faidx_t *fai = 0; | 
|  | 383 	int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0; | 
|  | 384 	char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0; | 
|  | 385         int ret=0; | 
|  | 386 | 
|  | 387 	/* parse command-line options */ | 
|  | 388 	strcpy(in_mode, "r"); strcpy(out_mode, "w"); | 
|  | 389 	while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) { | 
|  | 390 		switch (c) { | 
|  | 391 		case 'S': is_bamin = 0; break; | 
|  | 392 		case 's': assert(compress_level == -1); is_bamout = 0; break; | 
|  | 393 		case 'o': fn_out = strdup(optarg); break; | 
|  | 394 		case 'u': assert(is_bamout == 1); compress_level = 0; break; | 
|  | 395 		case '1': assert(is_bamout == 1); compress_level = 1; break; | 
|  | 396 		case 'T': fn_ref = strdup(optarg); break; | 
|  | 397                 case '?': is_long_help = 1; break; | 
|  | 398 		default: return usage(is_long_help); | 
|  | 399 		} | 
|  | 400         } | 
|  | 401 	if (argc == optind) return usage(is_long_help); | 
|  | 402 | 
|  | 403 	if (is_bamin) strcat(in_mode, "b"); | 
|  | 404 	if (is_bamout) strcat(out_mode, "b"); | 
|  | 405 	strcat(out_mode, "h"); | 
|  | 406 	if (compress_level >= 0) { | 
|  | 407 		char tmp[2]; | 
|  | 408 		tmp[0] = compress_level + '0'; tmp[1] = '\0'; | 
|  | 409 		strcat(out_mode, tmp); | 
|  | 410 	} | 
|  | 411 | 
|  | 412 	// Load FASTA reference (also needed for SAM -> BAM if missing header) | 
|  | 413 	if (fn_ref) { | 
|  | 414 		fn_list = samfaipath(fn_ref); | 
|  | 415 		fai = fai_load(fn_ref); | 
|  | 416 	} | 
|  | 417 	// open file handlers | 
|  | 418 	if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { | 
|  | 419 		fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]); | 
|  | 420 		ret = 1; | 
|  | 421 		goto depad_end; | 
|  | 422 	} | 
|  | 423 	if (in->header == 0) { | 
|  | 424 		fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); | 
|  | 425 		ret = 1; | 
|  | 426 		goto depad_end; | 
|  | 427 	} | 
|  | 428 	if (in->header->text == 0 || in->header->l_text == 0) { | 
|  | 429 		fprintf(stderr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]); | 
|  | 430 		assert (0 == in->header->l_text); | 
|  | 431 		assert (0 == in->header->text); | 
|  | 432 	} | 
|  | 433 	if (fn_ref) { | 
|  | 434 		h = fix_header(in->header, fai); | 
|  | 435 	} else { | 
|  | 436 		fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); | 
|  | 437 		h = in->header; | 
|  | 438 	} | 
|  | 439 	if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) { | 
|  | 440 		fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); | 
|  | 441 		ret = 1; | 
|  | 442 		goto depad_end; | 
|  | 443 	} | 
|  | 444 | 
|  | 445 	// Do the depad | 
|  | 446 	ret = bam_pad2unpad(in, out, fai); | 
|  | 447 | 
|  | 448 depad_end: | 
|  | 449 	// close files, free and return | 
|  | 450 	if (fai) fai_destroy(fai); | 
|  | 451 	if (h != in->header) bam_header_destroy(h); | 
|  | 452 	samclose(in); | 
|  | 453 	samclose(out); | 
|  | 454 	free(fn_list); free(fn_out); | 
|  | 455 	return ret; | 
|  | 456 } | 
|  | 457 | 
|  | 458 static int usage(int is_long_help) | 
|  | 459 { | 
|  | 460 	fprintf(stderr, "\n"); | 
|  | 461 	fprintf(stderr, "Usage:   samtools depad <in.bam>\n\n"); | 
|  | 462 	fprintf(stderr, "Options: -s       output is SAM (default is BAM)\n"); | 
|  | 463 	fprintf(stderr, "         -S       input is SAM (default is BAM)\n"); | 
|  | 464 	fprintf(stderr, "         -u       uncompressed BAM output (can't use with -s)\n"); | 
|  | 465 	fprintf(stderr, "         -1       fast compression BAM output (can't use with -s)\n"); | 
|  | 466 	fprintf(stderr, "         -T FILE  reference sequence file [null]\n"); | 
|  | 467 	fprintf(stderr, "         -o FILE  output file name [stdout]\n"); | 
|  | 468 	fprintf(stderr, "         -?       longer help\n"); | 
|  | 469 	fprintf(stderr, "\n"); | 
|  | 470 	if (is_long_help) | 
|  | 471 		fprintf(stderr, "Notes:\n\ | 
|  | 472 \n\ | 
|  | 473   1. Requires embedded reference sequences (before the reads for that reference),\n\ | 
|  | 474      with the future aim to also support a FASTA padded reference sequence file.\n\ | 
|  | 475 \n\ | 
|  | 476   2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\ | 
|  | 477 \n"); | 
|  | 478         return 1; | 
|  | 479 } |