Commit 47dda8f

add comments as reviewer suggested
1 parent 3ed3b78 commit 47dda8f

File tree

2 files changed: +6 -28 lines changed


src/main/java/au/csiro/variantspark/utils/FileUtils.java

Lines changed: 1 addition & 27 deletions
@@ -20,37 +20,11 @@ public static boolean isBGZFile(String filePath) {
      * .vcf is not GZP file and get htsjdk.samtools.SAMFormatException: at header from java.io.BufferedReader.readLine(BufferedReader.java:389)
      */
     try (BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(filePath))) {
-      //bufferedInputStream.mark(100); // mark the current position
       boolean isValid = BlockCompressedInputStream.isValidFile(bufferedInputStream);
-      //bufferedInputStream.reset(); // reset back to the marked position
       return isValid;
     } catch (IOException e) {
-      // Handle the exception
+      //handle exception for non proper bgzip file
       return false;
     }
   }
-
-  /**
-   *
-   * @param file: an input file
-   * @return true if input file is Gzip by check the first two byte of input file
-   * @throws IOException
-   */
-  public static boolean isInputGZip(final File file) throws IOException {
-    //final PushbackInputStream pb = new PushbackInputStream(input, 2);
-
-    try(final InputStream input = new FileInputStream(file)){
-      int header = input.read(); //read ID1
-      if(header == -1) return false;
-
-      int b = input.read(); //read ID2
-      if(b == -1) return false;
-
-      //ID2 * 256 + ID1 = 35615
-      if( ( (b << 8) | header) == GZIPInputStream.GZIP_MAGIC)
-        return true;
-    }
-
-    return false;
-  }
 }
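For context, BGZF is ordinary gzip with an extra "BC" subfield in each block header, which is why the retained check delegates to htsjdk's BlockCompressedInputStream.isValidFile instead of only testing the gzip magic bytes as the removed isInputGZip did. A minimal Scala sketch of the same check (the BgzCheck object and isBGZ name are illustrative, not part of the commit):

    import java.io.{BufferedInputStream, FileInputStream, IOException}
    import htsjdk.samtools.util.BlockCompressedInputStream

    object BgzCheck {
      // True only if the stream starts with a valid BGZF block header; the
      // BufferedInputStream provides the mark/reset support isValidFile needs
      // to peek at the header without consuming it.
      def isBGZ(filePath: String): Boolean = {
        val in = new BufferedInputStream(new FileInputStream(filePath))
        try BlockCompressedInputStream.isValidFile(in)
        catch { case _: IOException => false } // unreadable or truncated file: treat as not BGZF
        finally in.close()
      }
    }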

src/main/scala/au/csiro/variantspark/cli/args/SparkArgs.scala

Lines changed: 5 additions & 1 deletion
@@ -2,7 +2,7 @@ package au.csiro.variantspark.cli.args
 
 import org.kohsuke.args4j.Option
 import au.csiro.pbdava.ssparkle.spark.SparkApp
-import au.csiro.variantspark.utils._
+import au.csiro.variantspark.utils.FileUtils
 import org.apache.spark.rdd.RDD
 import htsjdk.samtools.util.BlockCompressedInputStream
 import org.apache.hadoop.fs.Path
@@ -18,11 +18,15 @@ trait SparkArgs extends SparkApp {
     val isBGZ = FileUtils.isBGZFile(inputFile)
     println(inputFile + " is loading to spark RDD, isBGZFile: " + isBGZ)
     if (isBGZ) {
+      // BGZIP file is compressed as blocks, requires specialized libraries htsjdk
       val path = new Path(inputFile)
       val fs = path.getFileSystem(sc.hadoopConfiguration)
       val bgzInputStream = new BlockCompressedInputStream(fs.open(path))
+      // each blocks can be decompressed independently and to be read in parallel
       sc.parallelize(Stream.continually(bgzInputStream.readLine()).takeWhile(_ != null).toList)
     } else {
+      // The standard GZIP libraries can handle files compressed as a whole
+      // load .vcf, .vcf.gz or .vcf.bz2 to RDD
       sc.textFile(inputFile, if (sparkPar > 0) sparkPar else sc.defaultParallelism)
     }
   }
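Taken together, the two files implement the flow below. This is a self-contained sketch assuming a live SparkContext; the VcfLoadSketch object and loadLines signature are illustrative, while FileUtils.isBGZFile and the htsjdk/Hadoop calls come from the commit itself.

    import au.csiro.variantspark.utils.FileUtils
    import htsjdk.samtools.util.BlockCompressedInputStream
    import org.apache.hadoop.fs.Path
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD

    object VcfLoadSketch {
      def loadLines(sc: SparkContext, inputFile: String, sparkPar: Int = 0): RDD[String] =
        if (FileUtils.isBGZFile(inputFile)) {
          // BGZF: decompress through htsjdk, which understands the block structure,
          // then parallelize the decoded lines (the read itself happens on the driver)
          val path = new Path(inputFile)
          val fs = path.getFileSystem(sc.hadoopConfiguration)
          val in = new BlockCompressedInputStream(fs.open(path))
          sc.parallelize(Stream.continually(in.readLine()).takeWhile(_ != null).toList)
        } else {
          // plain .vcf, .vcf.gz or .vcf.bz2: Spark's built-in codec handling is enough
          sc.textFile(inputFile, if (sparkPar > 0) sparkPar else sc.defaultParallelism)
        }
    }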
