Skip to content

Commit accf026

Browse files
committed
Merge pull request lh3#8 from nh13/master
Patches
2 parents 9134e0d + c554160 commit accf026

File tree

15 files changed

+682
-31
lines changed

15 files changed

+682
-31
lines changed

Makefile

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,23 @@
1-
CC= gcc
2-
CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
1+
CC?= gcc
2+
CFLAGS?= -g -Wall -O2 -march=nocona -pipe
3+
LDFLAGS?= -Wl,-rpath,\$$ORIGIN/../lib
34
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1
45
KNETFILE_O= knetfile.o
56
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
67
bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \
78
$(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o
8-
AOBJS= bam_tview.o bam_plcmd.o sam_view.o \
9-
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
9+
AOBJS= bam_tview.o bam_plcmd.o sam_view.o \
10+
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
1011
bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
11-
cut_target.o phase.o bam2depth.o padding.o
12+
cut_target.o phase.o bam2depth.o bam_qa.o bam_sample.o padding.o
1213
PROG= samtools
1314
INCLUDES= -I.
1415
SUBDIRS= . bcftools misc
1516
LIBPATH=
1617
LIBCURSES= -lcurses # -lXCurses
1718

1819
.SUFFIXES:.c .o
20+
.PHONY: all lib
1921

2022
.c.o:
2123
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
@@ -41,7 +43,7 @@ libbam.a:$(LOBJS)
4143
$(AR) -csru $@ $(LOBJS)
4244

4345
samtools:lib-recur $(AOBJS)
44-
$(CC) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf $(LIBCURSES) -lm -lz
46+
$(CC) $(CFLAGS) -o $@ $(AOBJS) $(LDFLAGS) -Lbcftools $(LIBPATH) libbam.a -lbcf $(LIBCURSES) -lm -lz
4547

4648
razip:razip.o razf.o $(KNETFILE_O)
4749
$(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz
@@ -60,6 +62,7 @@ bam_lpileup.o:bam.h ksort.h
6062
bam_tview.o:bam.h faidx.h
6163
bam_sort.o:bam.h ksort.h razf.h
6264
bam_md.o:bam.h faidx.h
65+
bam_qa.o:sam.h radix.h
6366
sam_header.o:sam_header.h khash.h
6467
bcf.o:bcftools/bcf.h
6568
bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
## This is *NOT* the official repository of SAMtools.
2+
The official SAMtools repository can be found at: http://samtools.sourceforge.net/
3+
4+
## Major fixes:
5+
- added "samtools sample" command, to sample reads from a SAM/BAM file at a given frequency.
6+
- added "samtools qa" command, to compute the mean and median coverage, as well as a histogram
7+
from 1 to N (defined by param) containing the number of bases covered a maximum of 1X, 2X...NX.
8+
Furthermore, "other" information is also available in the output file, namely:
9+
- Total number of reads
10+
- Total number of duplicates found and ignored (duplicates are "found" based on the sam flag
11+
and are ignored in the counting of coverage)
12+
- Percentage of unmapped reads
13+
- Percentage of zero quality mappings
14+
- Number of proper paired reads (based on sam flag of proper pairs)
15+
- Percentage of proper pairs.
16+
17+
## Minor fixes:
18+
- Check the write filehandle after opening for write.
19+
- allow for user defined [lowercase] tags in header elements.
20+
- allow the maximum memory for "samtools sort" to be specified with units.
21+
- adjust for leading hard clip on colorspace reads.
22+
- catches and reports an invalid BAM header, instead of segfaulting later on.
23+
- fixes a small underflow/overflow bug in integer parsing.
24+
- checks for a lowerbound in text entry box to avoid segfault in tview.

bam2depth.c

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,35 +29,57 @@ static int read_bam(void *data, bam1_t *b) // read level filters better go here
2929
return ret;
3030
}
3131

32+
typedef struct { // running state of the coverage bin being accumulated for circos output
33+
int32_t bin; // summed depth of the positions seen so far in the current bin
34+
int32_t bin_idx; // index of the current bin (position divided by bin_size)
35+
int32_t tid; // target id the current bin belongs to; circos_print() ignores negative values
36+
int32_t bin_size; // bin width used to group positions and to average the summed depth
37+
} circos_t;
38+
39+
static void circos_print(circos_t *circos, bam_header_t *h) // write the current bin to stdout as "name<TAB>start<TAB>end<TAB>mean depth"
40+
{
41+
if (circos->tid < 0 || 0 == circos->bin) return; // nothing to report: negative tid or a summed depth of zero
42+
// NB: this could be faster with custom routines
43+
fputs(h->target_name[circos->tid], stdout); // reference name looked up by target id in the header
44+
printf("\t%d\t%d\t%f\n", // NOTE(review): the caller derives bin_idx from a one-based position, so position bin_size is counted in bin 1 while the range printed for bin 0 is 1..bin_size -- confirm intended
45+
(circos->bin_idx * circos->bin_size) + 1, // first position of the printed range
46+
(circos->bin_idx + 1) * circos->bin_size, // last position of the printed range
47+
circos->bin / (double)circos->bin_size); // summed depth divided by the bin width
48+
}
49+
3250
#ifdef _MAIN_BAM2DEPTH
3351
int main(int argc, char *argv[])
3452
#else
3553
int main_depth(int argc, char *argv[])
3654
#endif
3755
{
38-
int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0;
56+
int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, use_circos=0;
3957
const bam_pileup1_t **plp;
4058
char *reg = 0; // specified region
4159
void *bed = 0; // BED data structure
4260
bam_header_t *h = 0; // BAM header of the 1st input
4361
aux_t **data;
4462
bam_mplp_t mplp;
63+
circos_t circos; circos.bin_size = 10000;
4564

4665
// parse the command line
47-
while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) {
66+
while ((n = getopt(argc, argv, "r:b:q:Q:cB:")) >= 0) {
4867
switch (n) {
4968
case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
5069
case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
5170
case 'q': baseQ = atoi(optarg); break; // base quality threshold
5271
case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
72+
case 'c': use_circos = 1; break; // circos output
73+
case 'B': circos.bin_size = atoi(optarg); break; // circos bin size
5374
}
5475
}
5576
if (optind == argc) {
56-
fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n");
77+
fprintf(stderr, "Usage: depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] [-c [-B binSize]] <in1.bam> [...]\n");
5778
return 1;
5879
}
5980

6081
// initialize the auxiliary data structures
82+
if (use_circos) circos.bin = circos.bin_idx = 0; circos.tid = -1;
6183
n = argc - optind; // the number of BAMs on the command line
6284
data = calloc(n, sizeof(void*)); // data[i] for the i-th input
6385
beg = 0; end = 1<<30; tid = -1; // set the default region
@@ -83,22 +105,39 @@ int main_depth(int argc, char *argv[])
83105
n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
84106
plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
85107
while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
108+
int32_t cov = 0;
86109
if (pos < beg || pos >= end) continue; // out of range; skip
87110
if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
88-
fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
89-
for (i = 0; i < n; ++i) { // base level filters have to go here
90-
int j, m = 0;
91-
for (j = 0; j < n_plp[i]; ++j) {
92-
const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
93-
if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
94-
else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
95-
}
96-
printf("\t%d", n_plp[i] - m); // this the depth to output
97-
}
98-
putchar('\n');
111+
if (0 == use_circos) { fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); } // a customized printf() would be faster
112+
for (i = 0; i < n; ++i) { // base level filters have to go here
113+
int j, m = 0;
114+
for (j = 0; j < n_plp[i]; ++j) {
115+
const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
116+
if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
117+
else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
118+
}
119+
if (0 == use_circos) printf("\t%d", n_plp[i] - m); // this the depth to output
120+
else cov += (n_plp[i] - m);
121+
}
122+
if (0 == use_circos) putchar('\n');
123+
else {
124+
pos++; // make one-based
125+
int32_t bin_idx = ((pos - (pos % circos.bin_size)) / circos.bin_size);
126+
if (tid == circos.tid && bin_idx == circos.bin_idx) {
127+
circos.bin += cov; // this is the depth to output
128+
}
129+
else {
130+
circos_print(&circos, h); // print
131+
// update
132+
circos.bin = cov; // this is the depth to output
133+
circos.bin_idx = bin_idx;
134+
circos.tid = tid;
135+
}
136+
}
99137
}
100138
free(n_plp); free(plp);
101139
bam_mplp_destroy(mplp);
140+
if (1 == use_circos) circos_print(&circos, h); // print
102141

103142
bam_header_destroy(h);
104143
for (i = 0; i < n; ++i) {

bam_color.c

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,14 @@ char bam_aux_getCSi(bam1_t *b, int i)
1919

2020
cs = bam_aux2Z(c);
2121
// adjust for strandedness and leading adaptor
22-
if(bam1_strand(b)) i = strlen(cs) - 1 - i;
23-
else i++;
22+
if(bam1_strand(b)) {
23+
i = strlen(cs) - 1 - i;
24+
// adjust for leading hard clip
25+
uint32_t cigar = bam1_cigar(b)[0];
26+
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
27+
i -= cigar >> BAM_CIGAR_SHIFT;
28+
}
29+
} else { i++; }
2430
return cs[i];
2531
}
2632

@@ -42,7 +48,14 @@ char bam_aux_getCQi(bam1_t *b, int i)
4248

4349
cq = bam_aux2Z(c);
4450
// adjust for strandedness
45-
if(bam1_strand(b)) i = strlen(cq) - 1 - i;
51+
if(bam1_strand(b)) {
52+
i = strlen(cq) - 1 - i;
53+
// adjust for leading hard clip
54+
uint32_t cigar = bam1_cigar(b)[0];
55+
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
56+
i -= (cigar >> BAM_CIGAR_SHIFT);
57+
}
58+
}
4659
return cq[i];
4760
}
4861

@@ -98,6 +111,11 @@ char bam_aux_getCEi(bam1_t *b, int i)
98111
// adjust for strandedness and leading adaptor
99112
if(bam1_strand(b)) { //reverse strand
100113
cs_i = strlen(cs) - 1 - i;
114+
// adjust for leading hard clip
115+
uint32_t cigar = bam1_cigar(b)[0];
116+
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
117+
cs_i -= cigar >> BAM_CIGAR_SHIFT;
118+
}
101119
// get current color
102120
cur_color = cs[cs_i];
103121
// get previous base. Note: must rc adaptor

bam_index.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,14 @@ bam_index_t *bam_index_core(bamFile fp)
159159
bam1_core_t *c;
160160
uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
161161

162+
h = bam_header_read(fp);
163+
if(h == 0) {
164+
fprintf(stderr, "[bam_index_core] Invalid BAM header.");
165+
return NULL;
166+
}
167+
162168
idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
163169
b = (bam1_t*)calloc(1, sizeof(bam1_t));
164-
h = bam_header_read(fp);
165170
c = &b->core;
166171

167172
idx->n = h->n_targets;

0 commit comments

Comments
 (0)