@@ -367,11 +367,11 @@ const char *bam_get_library(bam_header_t *h, const bam1_t *b)
367367
368368int bam_remove_B (bam1_t * b )
369369{
370- int i , j , old_j , k , l , ins_B = 0 ;
370+ int i , j , end_j , k , l , no_qual ;
371371 uint32_t * cigar , * new_cigar ;
372372 uint8_t * seq , * qual , * p ;
373373 // test if removal is necessary
374- if (b -> core .flag & BAM_FUNMAP ) return 0 ; // unmapped
374+ if (b -> core .flag & BAM_FUNMAP ) return 0 ; // unmapped; do nothing
375375 cigar = bam1_cigar (b );
376376 for (k = 0 ; k < b -> core .n_cigar ; ++ k )
377377 if (bam_cigar_op (cigar [k ]) == BAM_CBACK ) break ;
@@ -387,45 +387,64 @@ int bam_remove_B(bam1_t *b)
387387 new_cigar = (uint32_t * )(b -> data + (b -> m_data - b -> core .n_cigar * 4 )); // from the end of b->data
388388 // the core loop
389389 seq = bam1_seq (b ); qual = bam1_qual (b );
390- i = j = 0 ; old_j = -1 ;
390+ no_qual = (qual [0 ] == 0xff ); // test whether base quality is available
391+ i = j = 0 ; end_j = -1 ;
391392 for (k = l = 0 ; k < b -> core .n_cigar ; ++ k ) {
392393 int op = bam_cigar_op (cigar [k ]);
393394 int len = bam_cigar_oplen (cigar [k ]);
394- if (op == BAM_CBACK ) {
395+ if (op == BAM_CBACK ) { // the backward operation
395396 int t , u = 0 , v = 0 ;
397+ if (k == b -> core .n_cigar - 1 ) break ; // ignore 'B' at the end of CIGAR
396398 for (t = l - 1 ; t >= 0 ; -- t ) { // look back
397399 int op0 = bam_cigar_op (new_cigar [t ]);
398400 int len0 = bam_cigar_oplen (new_cigar [t ]);
399- if (bam_cigar_type (op0 )& 2 ) {
401+ if (bam_cigar_type (op0 )& 2 ) { // consume the reference
400402 if (u + len0 >= len ) break ;
401403 u += len0 ;
402404 }
403- if (bam_cigar_type (op0 )& 1 ) v += len0 ;
405+ if (bam_cigar_type (op0 )& 1 ) v += len0 ; // consume the query
404406 }
405407 if (t < 0 ) goto rmB_err ; // a long backward operation
406- old_j = j ;
407- j = old_j - v - ((bam_cigar_type (bam_cigar_op (new_cigar [t ]))& 1 )? len - u : 0 );
408+ end_j = j ;
409+ j = end_j - v - ((bam_cigar_type (bam_cigar_op (new_cigar [t ]))& 1 )? len - u : 0 );
408410 new_cigar [t ] -= (len - u ) << BAM_CIGAR_SHIFT ; // FIXME: use bam_cigar_gen()
409- if (new_cigar [t ] == 0 && t >= 1 && bam_cigar_op (new_cigar [t - 1 ]) == BAM_CINS ) ins_B = 1 ;
411+ if (bam_cigar_oplen (new_cigar [t ]) == 0 && t >= 1 ) -- t ; // squeeze out the zero-length operation
412+ if (bam_cigar_op (cigar [k + 1 ]) == BAM_CINS ) { // the next operation is I, move back further
413+ int len0 , len1 ;
414+ if (bam_cigar_op (new_cigar [t ]) != BAM_CINS ) goto rmB_err ; // inconsistent CIGAR
415+ len1 = bam_cigar_oplen (cigar [k + 1 ]);
416+ len0 = bam_cigar_oplen (new_cigar [t ]);
417+ if (len1 > len0 ) goto rmB_err ; // CIGAR like: 3M1I1M1B2I10M; cannot be resolved
418+ new_cigar [t ] -= len1 << BAM_CIGAR_SHIFT ; // FIXME: use bam_cigar_gen()
419+ j -= len1 ;
420+ }
410421 l = t + 1 ;
411- } else {
412- if (ins_B && op == BAM_CINS ) goto rmB_err ; // FIXME: this can be resolved; just a bit complicated
413- ins_B = 0 ;
422+ } else { // other CIGAR operations
414423 new_cigar [l ++ ] = cigar [k ];
415424 if (bam_cigar_type (op )& 1 ) { // consume the query
416425 if (i != j ) { // no need to copy if i == j
417- int u ;
418- for (u = 0 ; u < len ; ++ u )
419- if (j + u >= old_j || qual [j + u ] < qual [i + u ]) {
420- int c = bam1_seqi (seq , i + u );
421- bam1_seq_seti (seq , j + u , c );
422- qual [j + u ] = qual [i + u ];
426+ int u , c , c0 ;
427+ for (u = 0 ; u < len ; ++ u ) { // construct the consensus
428+ c = bam1_seqi (seq , i + u );
429+ if (j + u < end_j ) { // in an overlap
430+ c0 = bam1_seqi (seq , j + u );
431+ if (c != c0 ) { // a mismatch; choose the better base
432+ if (qual [j + u ] < qual [i + u ]) { // the base in the 2nd segment is better
433+ bam1_seq_seti (seq , j + u , c );
434+ qual [j + u ] = qual [i + u ] - qual [j + u ];
435+ } else qual [j + u ] -= qual [i + u ]; // the 1st is better; reduce base quality
436+ } else qual [j + u ] = qual [j + u ] > qual [i + u ]? qual [j + u ] : qual [i + u ];
437+ } else { // not in an overlap; copy over
438+ bam1_seq_seti (seq , j + u , c );
439+ qual [j + u ] = qual [i + u ];
423440 }
441+ }
424442 }
425443 i += len , j += len ;
426444 }
427445 }
428446 }
447+ if (no_qual ) qual [0 ] = 0xff ; // in very rare cases, this may be modified
429448 // merge adjacent operations if possible
430449 for (k = 1 ; k < l ; ++ k )
431450 if (bam_cigar_op (new_cigar [k ]) == bam_cigar_op (new_cigar [k - 1 ]))
@@ -436,15 +455,13 @@ int bam_remove_B(bam1_t *b)
436455 new_cigar [i ++ ] = new_cigar [k ];
437456 l = i ;
438457 // update b
439- memcpy (cigar , new_cigar , l * 4 );
440- i = (j + 1 ) >> 1 ;
458+ memcpy (cigar , new_cigar , l * 4 ); // set CIGAR
441459 p = b -> data + b -> core .l_qname + l * 4 ;
442- memmove (p , seq , i ); p += i ;
443- memmove (p , qual , j ); p += j ;
444- memmove (p , bam1_aux (b ), b -> l_aux ); p += b -> l_aux ;
445- b -> core .n_cigar = l ;
446- b -> core .l_qseq = j ;
447- b -> data_len = p - b -> data ;
460+ memmove (p , seq , (j + 1 )>>1 ); p += (j + 1 )>>1 ; // set SEQ
461+ memmove (p , qual , j ); p += j ; // set QUAL
462+ memmove (p , bam1_aux (b ), b -> l_aux ); p += b -> l_aux ; // set optional fields
463+ b -> core .n_cigar = l , b -> core .l_qseq = j ; // update CIGAR length and query length
464+ b -> data_len = p - b -> data ; // update record length
448465 return 0 ;
449466
450467rmB_err :
0 commit comments