Skip to content

Commit 2a804f3

Browse files
Jonathan Manning authored and iontorrent-dev committed
Add samtools qa command, contributed by Roman Valls Guimera <[email protected]>.
Signed-off-by: Nils Homer <[email protected]>
1 parent 0f3207f commit 2a804f3

File tree

4 files changed

+389
-3
lines changed

4 files changed

+389
-3
lines changed

Makefile

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ KNETFILE_O= knetfile.o
66
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
77
bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \
88
$(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o
9-
AOBJS= bam_tview.o bam_plcmd.o sam_view.o \
10-
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
9+
AOBJS= bam_tview.o bam_plcmd.o sam_view.o \
10+
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
1111
bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
12-
cut_target.o phase.o bam2depth.o padding.o
12+
cut_target.o phase.o bam2depth.o bam_qa.o padding.o
1313
PROG= samtools
1414
INCLUDES= -I.
1515
SUBDIRS= . bcftools misc
@@ -62,6 +62,7 @@ bam_lpileup.o:bam.h ksort.h
6262
bam_tview.o:bam.h faidx.h
6363
bam_sort.o:bam.h ksort.h razf.h
6464
bam_md.o:bam.h faidx.h
65+
bam_qa.o:sam.h radix.h
6566
sam_header.o:sam_header.h khash.h
6667
bcf.o:bcftools/bcf.h
6768
bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h

bam_qa.c

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "radix.h"
#include "sam.h"
4+
5+
/* Settings of the qa subcommand, filled in from the command line. */
typedef struct
{
    int printAll;    /* compared against 1 to decide whether every sequence is written to the output file */
    int doMedian;    /* non-zero: the median coverage is reported as well */
    int maxCoverage; /* index of the last bin of the coverage histogram */
} Options;
9+
10+
/**
11+
* Check if read is properly mapped
12+
* @return true if read mapped, false otherwise
13+
*/
14+
static inline int is_mapped(const bam1_core_t *core)
15+
{
16+
return !(core->flag&BAM_FUNMAP);
17+
}
18+
19+
/**
 * Write the help text of the qa subcommand to stderr.
 * @return always 1
 */
static int print_usage()
{
    /* One entry per output line group, emitted in order. */
    static const char *const helpText[] = {
        "\n",
        "Usage: samtools qa [options] <in.bam> <output.out>\n",
        "Options: -a Don't print alternate assemblies to the output file (for human genome)\n",
        " -m Also compute median coverage\n",
        " -c [INT] Maximum coverage to consider in histogram [30]\n",
        "\n",
        "Note: Input file should be sorted\n\n"
    };
    const unsigned int count = sizeof helpText / sizeof helpText[0];
    unsigned int k;

    for (k = 0; k < count; ++k) {
        fputs(helpText[k], stderr);
    }
    return 1;
}
33+
34+
/**
 * Convert the per-position depth changes of one reference sequence into
 * depths, add them to the coverage histogram and report the average
 * (and, on request, the median) coverage of that sequence.
 *
 * @param outputFile    report file; receives one tab-separated row for the sequence
 * @param userOpt       settings; printAll, doMedian and maxCoverage are read
 *                      (maxCoverage must be >= 0)
 * @param data          chrSize entries; on entry data[i] is the change in depth at
 *                      position i, on return data[i] is a depth (the array is
 *                      sorted when userOpt.doMedian is set)
 * @param name          sequence name used in the messages and the report row
 * @param chrSize       number of positions in data
 * @param coverageHist  userOpt.maxCoverage+1 counters; counter d is incremented for
 *                      every position of depth d, depths above maxCoverage go to
 *                      the last counter
 * @param currentTid    index of the sequence; when userOpt.printAll is not 1 only
 *                      sequences with an index below 24 are written to outputFile
 */
static void compute_print_cov(FILE* outputFile, Options userOpt, int* data, char* name,const uint32_t chrSize, int64_t* coverageHist,const int currentTid)
{
    int32_t depth = 0;
    int64_t depthSum = 0;
    double mean = 0.0;
    int median = 0;
    uint32_t pos;

    /* Running sum of the deltas gives the depth at every position. */
    for (pos = 0; pos < chrSize; ++pos) {
        int32_t bin;

        depth += data[pos];
        /* Stored unconditionally: only needed for the median, but cheaper than a test. */
        data[pos] = depth;
        depthSum += depth;

        bin = depth;
        if (bin > userOpt.maxCoverage) {
            bin = userOpt.maxCoverage;
        }
        if (bin < 0) {
            /* Inconsistent deltas would otherwise index before the histogram. */
            bin = 0;
        }
        ++coverageHist[bin];
    }

    /* An empty sequence has no positions to average or to pick a median from. */
    if (chrSize > 0) {
        mean = (double)depthSum / chrSize;
        if (userOpt.doMedian) {
            radix_sort(data, chrSize);
            median = data[chrSize / 2];
        }
    }

    printf("Average coverage over %s : %3.2f\n", name, mean);
    if (userOpt.doMedian) {
        printf("Median coverage over %s : %d\n", name, median);
    }

    /* Without printAll only the first 24 sequences are reported; this
     * cut-off is specific to the human genome (alternate assemblies follow). */
    if (userOpt.printAll == 1 || currentTid < 24) {
        if (userOpt.doMedian) {
            fprintf(outputFile, "%s\t%u\t%3.5f\t%d\n", name, (unsigned int)chrSize, mean, median);
        } else {
            fprintf(outputFile, "%s\t%u\t%3.5f\n", name, (unsigned int)chrSize, mean);
        }
    }
}
77+
78+
/**
79+
* Main of app
80+
*/
81+
int main_qa(int argc, char *argv[])
82+
{
83+
samfile_t *fp;
84+
FILE *outputFile;
85+
Options userOpt;
86+
userOpt.printAll = 1;
87+
userOpt.doMedian = 0;
88+
userOpt.maxCoverage = 30;
89+
int arg;
90+
//Get args
91+
while ((arg = getopt(argc, argv, "amc:")) >= 0) {
92+
switch (arg) {
93+
case 'a': userOpt.printAll = 0; break;
94+
case 'm': userOpt.doMedian = 1; break;
95+
case 'c': userOpt.maxCoverage = atoi(optarg); break;
96+
}
97+
}
98+
99+
if (argc-optind != 2) {
100+
print_usage();
101+
return 1;
102+
}
103+
104+
//Note that file is supposed to have been ordered beforehand!
105+
if ((fp = samopen(argv[optind], "rb", 0)) == 0) {
106+
fprintf(stderr, "qaCompute: Fail to open BAM file %s\n", argv[1]);
107+
return 1;
108+
}
109+
if ((outputFile = fopen(argv[optind+1], "wt")) == 0) {
110+
fprintf(stderr, "qaCompute: Filed to create output file %s\n", argv[2]);
111+
return 1;
112+
}
113+
114+
115+
//Initialize bam entity
116+
bam1_t *b = bam_init1();
117+
118+
//All var declarations
119+
int64_t totalGenomeLength = 0;
120+
int32_t unmappedReads = 0;
121+
int32_t zeroQualityReads = 0;
122+
int32_t totalNumberOfReads = 0;
123+
int32_t totalProperPaires = 0;
124+
uint32_t chrSize = 0;
125+
126+
int32_t duplicates = 0;
127+
128+
int *entireChr = NULL;
129+
//Keep header for further reference
130+
bam_header_t* head = fp->header;
131+
132+
int32_t currentTid = -1;
133+
134+
//Create "map" vector for histogram
135+
int64_t* coverageHist = (int64_t*)malloc((userOpt.maxCoverage+1)*sizeof(int64_t));
136+
memset( coverageHist, 0, (userOpt.maxCoverage+1)*sizeof(int64_t));
137+
138+
//Write file table header
139+
if (userOpt.doMedian == 1)
140+
fprintf(outputFile, "Chromosome\tSeq_len\tAvg_Cov\tMedian_Cov\n");
141+
else
142+
fprintf(outputFile, "Chromosome\tSeq_lem\tAvg_Cov\n");
143+
144+
while (samread(fp, b) >= 0) {
145+
146+
//uint32_t* cigar = bam1_cigar(b);
147+
148+
//Get bam core.
149+
const bam1_core_t *core = &b->core;
150+
151+
if (core == NULL) {
152+
//There is something wrong with the read/file
153+
printf("Input file is corrupt!");
154+
//Leak everything and exit!
155+
return -1;
156+
}
157+
158+
//BAM block has been read
159+
if (!is_mapped(core))
160+
++unmappedReads;
161+
else {
162+
163+
if (core->tid != currentTid) {
164+
165+
//Count coverage!
166+
if (currentTid != -1) {
167+
compute_print_cov(outputFile, userOpt, entireChr, head->target_name[currentTid], chrSize, coverageHist, currentTid);
168+
}
169+
170+
//Get length of next section
171+
chrSize = head->target_len[core->tid];
172+
totalGenomeLength += chrSize;
173+
printf("Computing %s of size %d... \n",head->target_name[core->tid],chrSize);
174+
175+
//Done with current section.
176+
//Allocate memory
177+
entireChr = (int*)realloc(entireChr, (chrSize+1)*sizeof(int));
178+
179+
if (entireChr == NULL) {
180+
printf("Allocation failed! \n");
181+
return -1;
182+
}
183+
memset(entireChr, 0, (chrSize+1)*sizeof(int));
184+
185+
currentTid = core->tid;
186+
187+
}
188+
189+
//If read has quality == 0, we won't count it as mapped
190+
if (core->qual != 0) {
191+
if (core->flag&BAM_FPROPER_PAIR) {
192+
//Is part of a proper pair
193+
++totalProperPaires;
194+
}
195+
196+
if (core->flag&BAM_FDUP) {
197+
//This is a duplicate. Don't count it!.
198+
++duplicates;
199+
} else {
200+
//All entries in SAM file are represented on the forward strand! (See specs of SAM format for details)
201+
++entireChr[core->pos];
202+
203+
if (core->pos+core->l_qseq >= chrSize)
204+
--entireChr[chrSize-1];
205+
else
206+
--entireChr[core->pos+core->l_qseq];
207+
}
208+
209+
} else {
210+
//Count is as unmapped?
211+
++zeroQualityReads;
212+
}
213+
}
214+
215+
++totalNumberOfReads;
216+
217+
}
218+
219+
//Compute coverage for the last "chromosome"
220+
compute_print_cov(outputFile, userOpt, entireChr, head->target_name[currentTid], chrSize, coverageHist, currentTid);
221+
222+
bam_destroy1(b);
223+
free(entireChr);
224+
225+
printf("\n Duplicates:%d \n", duplicates);
226+
227+
//Print header for next table in output file
228+
fprintf(outputFile,"\nCov*X\tPercentage\tNr. of bases\n");
229+
230+
printf("Total genome lenght %ld \n", totalGenomeLength);
231+
//Compute procentages of genome cover!
232+
int i = 0;
233+
for (; i <= userOpt.maxCoverage; ++i) {
234+
if (i == 0) {
235+
//Non-covered!
236+
printf("%3.2f of genome has not been covered\n", (double)(100*coverageHist[i])/totalGenomeLength);
237+
} else {
238+
int64_t coverage = 0;
239+
//All that has been covered i, had been covered i+1, i+2 and so on times. Thus, do this addition
240+
int x = i;
241+
for (; x <= userOpt.maxCoverage; ++x)
242+
coverage += coverageHist[x];
243+
printf("%3.2f of genome has been covered at least %dX \n", (double)(100*coverage)/totalGenomeLength, i);
244+
fprintf(outputFile,"%d\t%3.5f\t%ld\n",i, (double)(100*coverage)/totalGenomeLength, coverageHist[i]);
245+
}
246+
}
247+
248+
fprintf(outputFile,"\nOther\n");
249+
250+
//Printout procentage of mapped/unmapped reads
251+
double procentageOfUnmapped = (100*unmappedReads)/totalNumberOfReads;
252+
double procentageOfZeroQuality = (100*zeroQualityReads)/totalNumberOfReads;
253+
fprintf(outputFile,"Total number of reads: %d\n", totalNumberOfReads);
254+
fprintf(outputFile,"Total number of duplicates found and ignored: %d\n", duplicates);
255+
fprintf(outputFile,"Percentage of unmapped reads: %3.5f\n", procentageOfUnmapped);
256+
fprintf(outputFile,"Percentage of zero quality mappings: %3.5f\n", procentageOfZeroQuality);
257+
int32_t nrOfPaires = totalNumberOfReads/2;
258+
double procOfProperPaires = (double)(100*(double)totalProperPaires/2)/nrOfPaires;
259+
fprintf(outputFile,"Number of proper paired reads: %d\n", totalProperPaires);
260+
fprintf(outputFile,"Percentage of proper pairs: %3.5f\n", procOfProperPaires);
261+
262+
printf("Out of %d reads, you have %3.5f unmapped reads\n and %3.5f zero quality mappings\n", totalNumberOfReads ,procentageOfUnmapped, procentageOfZeroQuality);
263+
264+
265+
free(coverageHist);
266+
267+
268+
samclose(fp);
269+
fclose(outputFile);
270+
return 0;
271+
}

bamtk.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ int main_cut_target(int argc, char *argv[]);
2626
int main_phase(int argc, char *argv[]);
2727
int main_cat(int argc, char *argv[]);
2828
int main_depth(int argc, char *argv[]);
29+
int main_qa(int argc, char* argv[]);
2930
int main_bam2fq(int argc, char *argv[]);
3031
int main_pad2unpad(int argc, char *argv[]);
3132

@@ -53,6 +54,7 @@ static int usage()
5354
fprintf(stderr, " merge merge sorted alignments\n");
5455
fprintf(stderr, " rmdup remove PCR duplicates\n");
5556
fprintf(stderr, " reheader replace BAM header\n");
57+
fprintf(stderr, " qa quality control\n");
5658
fprintf(stderr, " cat concatenate BAMs\n");
5759
fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n");
5860
fprintf(stderr, " phase phase heterozygotes\n");
@@ -105,6 +107,7 @@ int main(int argc, char *argv[])
105107
#if _CURSES_LIB != 0
106108
else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
107109
#endif
110+
else if (strcmp(argv[1], "qa") == 0) return main_qa(argc-1, argv+1);
108111
else {
109112
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
110113
return 1;

0 commit comments

Comments
 (0)