Skip to content

Commit 7539ef0

Browse files
committed
HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..). Contributed by mahadev
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@915168 13f79535-47bb-0310-9956-ffa450edef68
1 parent bde2103 commit 7539ef0

File tree

2 files changed

+68
-23
lines changed

2 files changed

+68
-23
lines changed

CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ Trunk (unreleased changes)
163163

164164
OPTIMIZATIONS
165165

166+
HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..).
167+
(mahadev via szetszwo)
168+
166169
BUG FIXES
167170

168171
HADOOP-6293. Fix FsShell -text to work on filesystems other than the

src/java/org/apache/hadoop/fs/HarFileSystem.java

Lines changed: 65 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -325,25 +325,12 @@ public Path makeQualified(Path path) {
325325
@Override
326326
public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
327327
long len) throws IOException {
328-
// need to look up the file in the underlying fs
329-
// look up the index
330-
331-
// make sure this is a prt of this har filesystem
332-
Path p = makeQualified(file.getPath());
333-
Path harPath = getPathInHar(p);
334-
String line = fileStatusInIndex(harPath);
335-
if (line == null) {
336-
throw new FileNotFoundException("File " + file.getPath() + " not found");
337-
}
338-
HarStatus harStatus = new HarStatus(line);
339-
if (harStatus.isDir()) {
340-
return new BlockLocation[0];
341-
}
342-
FileStatus fsFile = fs.getFileStatus(new Path(archivePath,
343-
harStatus.getPartName()));
344-
BlockLocation[] rawBlocks = fs.getFileBlockLocations(fsFile,
345-
harStatus.getStartIndex() + start, len);
346-
return fakeBlockLocations(rawBlocks, harStatus.getStartIndex());
328+
// just fake block locations
329+
// it's faster and simpler
330+
// doing various block location manipulation
331+
// with part files adds a lot of overhead because
332+
// of the look ups of filestatus in index files
333+
return new BlockLocation[]{ new BlockLocation() };
347334
}
348335

349336
/**
@@ -387,6 +374,63 @@ public Store(long begin, long end, int startHash, int endHash) {
387374
public int endHash;
388375
}
389376

377+
/**
378+
* Get filestatuses of all the children of a given directory. This just reads
379+
* through index file and reads line by line to get all statuses for children
380+
* of a directory. Its a brute force way of getting all such filestatuses
381+
*
382+
* @param parent
383+
* the parent path directory
384+
* @param statuses
385+
* the list to add the children filestatuses to
386+
* @param children
387+
* the string list of children for this parent
388+
* @param archiveIndexStat
389+
* the archive index filestatus
390+
*/
391+
private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
392+
List<String> children, FileStatus archiveIndexStat) throws IOException {
393+
// read the index file
394+
FSDataInputStream aIn = null;
395+
try {
396+
aIn = fs.open(archiveIndex);
397+
LineReader aLin;
398+
long read = 0;
399+
aLin = new LineReader(aIn, getConf());
400+
String parentString = parent.getName();
401+
Path harPath = new Path(parentString);
402+
int harlen = harPath.depth();
403+
Text line = new Text();
404+
while (read < archiveIndexStat.getLen()) {
405+
int tmp = aLin.readLine(line);
406+
read += tmp;
407+
String lineFeed = line.toString();
408+
String child = lineFeed.substring(0, lineFeed.indexOf(" "));
409+
if ((child.startsWith(parentString))) {
410+
Path thisPath = new Path(child);
411+
if (thisPath.depth() == harlen + 1) {
412+
// bingo!
413+
HarStatus hstatus = new HarStatus(lineFeed);
414+
FileStatus childStatus = new FileStatus(hstatus.isDir() ? 0
415+
: hstatus.getLength(), hstatus.isDir(), (int) archiveIndexStat
416+
.getReplication(), archiveIndexStat.getBlockSize(),
417+
archiveIndexStat.getModificationTime(), archiveIndexStat
418+
.getAccessTime(), new FsPermission(archiveIndexStat
419+
.getPermission()), archiveIndexStat.getOwner(),
420+
archiveIndexStat.getGroup(), makeRelative(this.uri.toString(),
421+
new Path(hstatus.name)));
422+
statuses.add(childStatus);
423+
}
424+
line.clear();
425+
}
426+
}
427+
} finally {
428+
if (aIn != null) {
429+
aIn.close();
430+
}
431+
}
432+
}
433+
390434
// make sure that this harPath is relative to the har filesystem
391435
// this only works for relative paths. This returns the line matching
392436
// the file in the index. Returns a null if there is not matching
@@ -650,10 +694,8 @@ public FileStatus[] listStatus(Path f) throws IOException {
650694
archiveStatus.getOwner(), archiveStatus.getGroup(),
651695
makeRelative(this.uri.toString(), new Path(hstatus.name))));
652696
else
653-
for (String child: hstatus.children) {
654-
FileStatus tmp = getFileStatus(new Path(tmpPath, child));
655-
statuses.add(tmp);
656-
}
697+
fileStatusesInIndex(hstatus, statuses, hstatus.children, archiveStatus);
698+
657699
return statuses.toArray(new FileStatus[statuses.size()]);
658700
}
659701

0 commit comments

Comments
 (0)