Skip to content

Commit 152119b

Browse files
author
Heng Li
committed
better consensus; a little more robust
1 parent 454da47 commit 152119b

File tree

2 files changed

+44
-27
lines changed

2 files changed

+44
-27
lines changed

bam.c

Lines changed: 43 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -367,11 +367,11 @@ const char *bam_get_library(bam_header_t *h, const bam1_t *b)
367367

368368
int bam_remove_B(bam1_t *b)
369369
{
370-
int i, j, old_j, k, l, ins_B = 0;
370+
int i, j, end_j, k, l, no_qual;
371371
uint32_t *cigar, *new_cigar;
372372
uint8_t *seq, *qual, *p;
373373
// test if removal is necessary
374-
if (b->core.flag & BAM_FUNMAP) return 0; // unmapped
374+
if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
375375
cigar = bam1_cigar(b);
376376
for (k = 0; k < b->core.n_cigar; ++k)
377377
if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
@@ -387,45 +387,64 @@ int bam_remove_B(bam1_t *b)
387387
new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
388388
// the core loop
389389
seq = bam1_seq(b); qual = bam1_qual(b);
390-
i = j = 0; old_j = -1;
390+
no_qual = (qual[0] == 0xff); // test whether base quality is available
391+
i = j = 0; end_j = -1;
391392
for (k = l = 0; k < b->core.n_cigar; ++k) {
392393
int op = bam_cigar_op(cigar[k]);
393394
int len = bam_cigar_oplen(cigar[k]);
394-
if (op == BAM_CBACK) {
395+
if (op == BAM_CBACK) { // the backward operation
395396
int t, u = 0, v = 0;
397+
if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
396398
for (t = l - 1; t >= 0; --t) { // look back
397399
int op0 = bam_cigar_op(new_cigar[t]);
398400
int len0 = bam_cigar_oplen(new_cigar[t]);
399-
if (bam_cigar_type(op0)&2) {
401+
if (bam_cigar_type(op0)&2) { // consume the reference
400402
if (u + len0 >= len) break;
401403
u += len0;
402404
}
403-
if (bam_cigar_type(op0)&1) v += len0;
405+
if (bam_cigar_type(op0)&1) v += len0; // consume the query
404406
}
405407
if (t < 0) goto rmB_err; // a long backward operation
406-
old_j = j;
407-
j = old_j - v - ((bam_cigar_type(bam_cigar_op(new_cigar[t]))&1)? len - u : 0);
408+
end_j = j;
409+
j = end_j - v - ((bam_cigar_type(bam_cigar_op(new_cigar[t]))&1)? len - u : 0);
408410
new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT; // FIXME: use bam_cigar_gen()
409-
if (new_cigar[t] == 0 && t >= 1 && bam_cigar_op(new_cigar[t-1]) == BAM_CINS) ins_B = 1;
411+
if (bam_cigar_oplen(new_cigar[t]) == 0 && t >= 1) --t; // squeeze out the zero-length operation
412+
if (bam_cigar_op(cigar[k+1]) == BAM_CINS) { // the next operation is I, move back further
413+
int len0, len1;
414+
if (bam_cigar_op(new_cigar[t]) != BAM_CINS) goto rmB_err; // inconsistent CIGAR
415+
len1 = bam_cigar_oplen(cigar[k+1]);
416+
len0 = bam_cigar_oplen(new_cigar[t]);
417+
if (len1 > len0) goto rmB_err; // CIGAR like: 3M1I1M1B2I10M; cannot be resolved
418+
new_cigar[t] -= len1 << BAM_CIGAR_SHIFT; // FIXME: use bam_cigar_gen()
419+
j -= len1;
420+
}
410421
l = t + 1;
411-
} else {
412-
if (ins_B && op == BAM_CINS) goto rmB_err; // FIXME: this can be resolved; just a bit complicated
413-
ins_B = 0;
422+
} else { // other CIGAR operations
414423
new_cigar[l++] = cigar[k];
415424
if (bam_cigar_type(op)&1) { // consume the query
416425
if (i != j) { // no need to copy if i == j
417-
int u;
418-
for (u = 0; u < len; ++u)
419-
if (j + u >= old_j || qual[j + u] < qual[i + u]) {
420-
int c = bam1_seqi(seq, i + u);
421-
bam1_seq_seti(seq, j + u, c);
422-
qual[j + u] = qual[i + u];
426+
int u, c, c0;
427+
for (u = 0; u < len; ++u) { // construct the consensus
428+
c = bam1_seqi(seq, i+u);
429+
if (j + u < end_j) { // in an overlap
430+
c0 = bam1_seqi(seq, j+u);
431+
if (c != c0) { // a mismatch; choose the better base
432+
if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
433+
bam1_seq_seti(seq, j+u, c);
434+
qual[j+u] = qual[i+u] - qual[j+u];
435+
} else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
436+
} else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
437+
} else { // not in an overlap; copy over
438+
bam1_seq_seti(seq, j+u, c);
439+
qual[j+u] = qual[i+u];
423440
}
441+
}
424442
}
425443
i += len, j += len;
426444
}
427445
}
428446
}
447+
if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
429448
// merge adjacent operations if possible
430449
for (k = 1; k < l; ++k)
431450
if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
@@ -436,15 +455,13 @@ int bam_remove_B(bam1_t *b)
436455
new_cigar[i++] = new_cigar[k];
437456
l = i;
438457
// update b
439-
memcpy(cigar, new_cigar, l * 4);
440-
i = (j + 1) >> 1;
458+
memcpy(cigar, new_cigar, l * 4); // set CIGAR
441459
p = b->data + b->core.l_qname + l * 4;
442-
memmove(p, seq, i); p += i;
443-
memmove(p, qual, j); p += j;
444-
memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux;
445-
b->core.n_cigar = l;
446-
b->core.l_qseq = j;
447-
b->data_len = p - b->data;
460+
memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
461+
memmove(p, qual, j); p += j; // set QUAL
462+
memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields
463+
b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
464+
b->data_len = p - b->data; // update record length
448465
return 0;
449466

450467
rmB_err:

bam.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
@copyright Genome Research Ltd.
4141
*/
4242

43-
#define BAM_VERSION "0.1.18 (r982:302)"
43+
#define BAM_VERSION "0.1.18-dev (r982:306)"
4444

4545
#include <stdint.h>
4646
#include <stdlib.h>

0 commit comments

Comments
 (0)