|
| 1 | +#include <stdio.h> |
| 2 | +#include "radix.h" |
| 3 | +#include "sam.h" |
| 4 | + |
| 5 | +typedef struct |
| 6 | +{ |
| 7 | + int printAll,doMedian,maxCoverage; |
| 8 | +} Options; |
| 9 | + |
| 10 | +/** |
| 11 | + * Check if read is properly mapped |
| 12 | + * @return true if read mapped, false otherwise |
| 13 | + */ |
| 14 | +static inline int is_mapped(const bam1_core_t *core) |
| 15 | +{ |
| 16 | + return !(core->flag&BAM_FUNMAP); |
| 17 | +} |
| 18 | + |
| 19 | +/** |
| 20 | + * Print usage instructions |
| 21 | + */ |
| 22 | +static int print_usage() |
| 23 | +{ |
| 24 | + fprintf(stderr, "\n"); |
| 25 | + fprintf(stderr, "Usage: samtools qa [options] <in.bam> <output.out>\n"); |
| 26 | + fprintf(stderr, "Options: -a Don't print alternate assemblies to the output file (for human genome)\n"); |
| 27 | + fprintf(stderr, " -m Also compute median coverage\n"); |
| 28 | + fprintf(stderr, " -c [INT] Maximum coverage to consider in histogram [30]\n"); |
| 29 | + fprintf(stderr, "\n"); |
| 30 | + fprintf(stderr, "Note: Input file should be sorted\n\n"); |
| 31 | + return 1; |
| 32 | +} |
| 33 | + |
| 34 | +static void compute_print_cov(FILE* outputFile, Options userOpt, int* data, char* name,const uint32_t chrSize, int64_t* coverageHist,const int currentTid) |
| 35 | +{ |
| 36 | + int32_t covVal = 0; |
| 37 | + int64_t covSum = 0; |
| 38 | + int32_t i; |
| 39 | + |
| 40 | + //Go through chromosome and count avarage covarage. |
| 41 | + for (i=0; i<chrSize; ++i){ |
| 42 | + covVal += data[i]; |
| 43 | + //This will be sorted later. |
| 44 | + //If -m was not defined, this is useless, but cheaper than an 'if' |
| 45 | + data[i] = covVal; |
| 46 | + covSum += covVal; |
| 47 | + //Add value to histogram |
| 48 | + if (covVal > userOpt.maxCoverage) { |
| 49 | + ++coverageHist[userOpt.maxCoverage]; |
| 50 | + } else { |
| 51 | + ++coverageHist[covVal]; |
| 52 | + } |
| 53 | + |
| 54 | + } |
| 55 | + if (userOpt.doMedian) |
| 56 | + //Sort entireChr |
| 57 | + radix_sort(data, chrSize); |
| 58 | + |
| 59 | + //Printout avarage coverage over this chrom |
| 60 | + printf("Average coverage over %s : %3.2f\n", name, (double)covSum / chrSize); |
| 61 | + if (userOpt.doMedian) |
| 62 | + printf("Median coverage over %s : %d\n", name, data[chrSize/2]); |
| 63 | + if (userOpt.printAll == 1) { |
| 64 | + if (userOpt.doMedian) |
| 65 | + fprintf(outputFile, "%s\t%d\t%3.5f\t%d\n", name, chrSize, (double)covSum / chrSize, data[chrSize/2]); |
| 66 | + else |
| 67 | + fprintf(outputFile, "%s\t%d\t%3.5f\n", name, chrSize, (double)covSum / chrSize); |
| 68 | + } else if (currentTid < 24) { |
| 69 | + //Don't print alternate assemblies to the file |
| 70 | + //This is human genome specific |
| 71 | + if (userOpt.doMedian) |
| 72 | + fprintf(outputFile, "%s\t%d\t%3.5f\t%d\n", name, chrSize, (double)covSum / chrSize, data[chrSize/2]); |
| 73 | + else |
| 74 | + fprintf(outputFile, "%s\t%d\t%3.5f\n", name, chrSize, (double)covSum / chrSize); |
| 75 | + } |
| 76 | +} |
| 77 | + |
| 78 | +/** |
| 79 | + * Main of app |
| 80 | + */ |
| 81 | +int main_qa(int argc, char *argv[]) |
| 82 | +{ |
| 83 | + samfile_t *fp; |
| 84 | + FILE *outputFile; |
| 85 | + Options userOpt; |
| 86 | + userOpt.printAll = 1; |
| 87 | + userOpt.doMedian = 0; |
| 88 | + userOpt.maxCoverage = 30; |
| 89 | + int arg; |
| 90 | + //Get args |
| 91 | + while ((arg = getopt(argc, argv, "amc:")) >= 0) { |
| 92 | + switch (arg) { |
| 93 | + case 'a': userOpt.printAll = 0; break; |
| 94 | + case 'm': userOpt.doMedian = 1; break; |
| 95 | + case 'c': userOpt.maxCoverage = atoi(optarg); break; |
| 96 | + } |
| 97 | + } |
| 98 | + |
| 99 | + if (argc-optind != 2) { |
| 100 | + print_usage(); |
| 101 | + return 1; |
| 102 | + } |
| 103 | + |
| 104 | + //Note that file is supposed to have been ordered beforehand! |
| 105 | + if ((fp = samopen(argv[optind], "rb", 0)) == 0) { |
| 106 | + fprintf(stderr, "qaCompute: Fail to open BAM file %s\n", argv[1]); |
| 107 | + return 1; |
| 108 | + } |
| 109 | + if ((outputFile = fopen(argv[optind+1], "wt")) == 0) { |
| 110 | + fprintf(stderr, "qaCompute: Filed to create output file %s\n", argv[2]); |
| 111 | + return 1; |
| 112 | + } |
| 113 | + |
| 114 | + |
| 115 | + //Initialize bam entity |
| 116 | + bam1_t *b = bam_init1(); |
| 117 | + |
| 118 | + //All var declarations |
| 119 | + int64_t totalGenomeLength = 0; |
| 120 | + int32_t unmappedReads = 0; |
| 121 | + int32_t zeroQualityReads = 0; |
| 122 | + int32_t totalNumberOfReads = 0; |
| 123 | + int32_t totalProperPaires = 0; |
| 124 | + uint32_t chrSize = 0; |
| 125 | + |
| 126 | + int32_t duplicates = 0; |
| 127 | + |
| 128 | + int *entireChr = NULL; |
| 129 | + //Keep header for further reference |
| 130 | + bam_header_t* head = fp->header; |
| 131 | + |
| 132 | + int32_t currentTid = -1; |
| 133 | + |
| 134 | + //Create "map" vector for histogram |
| 135 | + int64_t* coverageHist = (int64_t*)malloc((userOpt.maxCoverage+1)*sizeof(int64_t)); |
| 136 | + memset( coverageHist, 0, (userOpt.maxCoverage+1)*sizeof(int64_t)); |
| 137 | + |
| 138 | + //Write file table header |
| 139 | + if (userOpt.doMedian == 1) |
| 140 | + fprintf(outputFile, "Chromosome\tSeq_len\tAvg_Cov\tMedian_Cov\n"); |
| 141 | + else |
| 142 | + fprintf(outputFile, "Chromosome\tSeq_lem\tAvg_Cov\n"); |
| 143 | + |
| 144 | + while (samread(fp, b) >= 0) { |
| 145 | + |
| 146 | + //uint32_t* cigar = bam1_cigar(b); |
| 147 | + |
| 148 | + //Get bam core. |
| 149 | + const bam1_core_t *core = &b->core; |
| 150 | + |
| 151 | + if (core == NULL) { |
| 152 | + //There is something wrong with the read/file |
| 153 | + printf("Input file is corrupt!"); |
| 154 | + //Leak everything and exit! |
| 155 | + return -1; |
| 156 | + } |
| 157 | + |
| 158 | + //BAM block has been read |
| 159 | + if (!is_mapped(core)) |
| 160 | + ++unmappedReads; |
| 161 | + else { |
| 162 | + |
| 163 | + if (core->tid != currentTid) { |
| 164 | + |
| 165 | + //Count coverage! |
| 166 | + if (currentTid != -1) { |
| 167 | + compute_print_cov(outputFile, userOpt, entireChr, head->target_name[currentTid], chrSize, coverageHist, currentTid); |
| 168 | + } |
| 169 | + |
| 170 | + //Get length of next section |
| 171 | + chrSize = head->target_len[core->tid]; |
| 172 | + totalGenomeLength += chrSize; |
| 173 | + printf("Computing %s of size %d... \n",head->target_name[core->tid],chrSize); |
| 174 | + |
| 175 | + //Done with current section. |
| 176 | + //Allocate memory |
| 177 | + entireChr = (int*)realloc(entireChr, (chrSize+1)*sizeof(int)); |
| 178 | + |
| 179 | + if (entireChr == NULL) { |
| 180 | + printf("Allocation failed! \n"); |
| 181 | + return -1; |
| 182 | + } |
| 183 | + memset(entireChr, 0, (chrSize+1)*sizeof(int)); |
| 184 | + |
| 185 | + currentTid = core->tid; |
| 186 | + |
| 187 | + } |
| 188 | + |
| 189 | + //If read has quality == 0, we won't count it as mapped |
| 190 | + if (core->qual != 0) { |
| 191 | + if (core->flag&BAM_FPROPER_PAIR) { |
| 192 | + //Is part of a proper pair |
| 193 | + ++totalProperPaires; |
| 194 | + } |
| 195 | + |
| 196 | + if (core->flag&BAM_FDUP) { |
| 197 | + //This is a duplicate. Don't count it!. |
| 198 | + ++duplicates; |
| 199 | + } else { |
| 200 | + //All entries in SAM file are represented on the forward strand! (See specs of SAM format for details) |
| 201 | + ++entireChr[core->pos]; |
| 202 | + |
| 203 | + if (core->pos+core->l_qseq >= chrSize) |
| 204 | + --entireChr[chrSize-1]; |
| 205 | + else |
| 206 | + --entireChr[core->pos+core->l_qseq]; |
| 207 | + } |
| 208 | + |
| 209 | + } else { |
| 210 | + //Count is as unmapped? |
| 211 | + ++zeroQualityReads; |
| 212 | + } |
| 213 | + } |
| 214 | + |
| 215 | + ++totalNumberOfReads; |
| 216 | + |
| 217 | + } |
| 218 | + |
| 219 | + //Compute coverage for the last "chromosome" |
| 220 | + compute_print_cov(outputFile, userOpt, entireChr, head->target_name[currentTid], chrSize, coverageHist, currentTid); |
| 221 | + |
| 222 | + bam_destroy1(b); |
| 223 | + free(entireChr); |
| 224 | + |
| 225 | + printf("\n Duplicates:%d \n", duplicates); |
| 226 | + |
| 227 | + //Print header for next table in output file |
| 228 | + fprintf(outputFile,"\nCov*X\tPercentage\tNr. of bases\n"); |
| 229 | + |
| 230 | + printf("Total genome lenght %ld \n", totalGenomeLength); |
| 231 | + //Compute procentages of genome cover! |
| 232 | + int i = 0; |
| 233 | + for (; i <= userOpt.maxCoverage; ++i) { |
| 234 | + if (i == 0) { |
| 235 | + //Non-covered! |
| 236 | + printf("%3.2f of genome has not been covered\n", (double)(100*coverageHist[i])/totalGenomeLength); |
| 237 | + } else { |
| 238 | + int64_t coverage = 0; |
| 239 | + //All that has been covered i, had been covered i+1, i+2 and so on times. Thus, do this addition |
| 240 | + int x = i; |
| 241 | + for (; x <= userOpt.maxCoverage; ++x) |
| 242 | + coverage += coverageHist[x]; |
| 243 | + printf("%3.2f of genome has been covered at least %dX \n", (double)(100*coverage)/totalGenomeLength, i); |
| 244 | + fprintf(outputFile,"%d\t%3.5f\t%ld\n",i, (double)(100*coverage)/totalGenomeLength, coverageHist[i]); |
| 245 | + } |
| 246 | + } |
| 247 | + |
| 248 | + fprintf(outputFile,"\nOther\n"); |
| 249 | + |
| 250 | + //Printout procentage of mapped/unmapped reads |
| 251 | + double procentageOfUnmapped = (100*unmappedReads)/totalNumberOfReads; |
| 252 | + double procentageOfZeroQuality = (100*zeroQualityReads)/totalNumberOfReads; |
| 253 | + fprintf(outputFile,"Total number of reads: %d\n", totalNumberOfReads); |
| 254 | + fprintf(outputFile,"Total number of duplicates found and ignored: %d\n", duplicates); |
| 255 | + fprintf(outputFile,"Percentage of unmapped reads: %3.5f\n", procentageOfUnmapped); |
| 256 | + fprintf(outputFile,"Percentage of zero quality mappings: %3.5f\n", procentageOfZeroQuality); |
| 257 | + int32_t nrOfPaires = totalNumberOfReads/2; |
| 258 | + double procOfProperPaires = (double)(100*(double)totalProperPaires/2)/nrOfPaires; |
| 259 | + fprintf(outputFile,"Number of proper paired reads: %d\n", totalProperPaires); |
| 260 | + fprintf(outputFile,"Percentage of proper pairs: %3.5f\n", procOfProperPaires); |
| 261 | + |
| 262 | + printf("Out of %d reads, you have %3.5f unmapped reads\n and %3.5f zero quality mappings\n", totalNumberOfReads ,procentageOfUnmapped, procentageOfZeroQuality); |
| 263 | + |
| 264 | + |
| 265 | + free(coverageHist); |
| 266 | + |
| 267 | + |
| 268 | + samclose(fp); |
| 269 | + fclose(outputFile); |
| 270 | + return 0; |
| 271 | +} |
0 commit comments