Skip to content

Opening and destroying the same entry causes high memory usage #87

Closed
@noreng

Description

@noreng

I need to read the first row of a zipped csv file to create a table in a postgresql database, and then pg-copy the whole csv file to the table.

I open the same zip entry twice with zipFile.openReadStream. After I read the first chunk, I destroy the stream (with readStream.destroy()) and open the entry again. Everything works as I expected, except for the memory usage:
I have a ~1.6 GB csv file compressed to a 110 MB zip. When I open the entry for the first time, the max memory usage (RSS) is 120 MB. After re-opening the same entry, the RSS climbs steadily from 120 MB to 1.7 GB. If I don't destroy the stream on the first run and let it finish instead, the max RSS is 150 MB and it stays this low after I reopen the entry, so I don't think the problem is with re-opening the entry. There must be something wrong with how I destroy the stream.

My system: Windows 10, Node 10.

The excerpt of my code:

const yauzl = require('yauzl');
const fs = require('fs');
const promisify = require('es6-promisify');

/**
 * Reproduction driver.
 *
 * Loads './large.zip' into memory, then for every entry:
 *   1) opens a read stream, consumes only the first chunk and destroys it;
 *   2) opens a second read stream for the same entry and reads it to the end.
 * Memory usage is logged along the way by readEntry().
 *
 * Any failure (zipfile 'error' event, stream error, rejected open) rejects
 * the driver promise instead of surfacing as an unhandled rejection.
 */
(async function () {
    const fileBuffer = await promisify(fs.readFile)('./large.zip');

    console.log('started');
    const yauzlFromBuffer = promisify(yauzl.fromBuffer);
    const zipfile = await yauzlFromBuffer(fileBuffer, { lazyEntries: true });

    const openReadStream = promisify(zipfile.openReadStream.bind(zipfile));

    await new Promise((resolve, reject) => {
        zipfile.on('error', reject);

        zipfile.on('end', () => {
            console.log('finished');
            resolve();
        });

        zipfile.on('entry', async (entry) => {
            // An async listener's rejection is not observed by the emitter,
            // so forward it to the surrounding promise explicitly.
            try {
                console.log('1) read the entry');
                const stream = await openReadStream(entry);
                // read the first chunk only
                const justFirstChunk = true;
                await readEntry(stream, entry.uncompressedSize, justFirstChunk);

                // This is where I handle the first row

                console.log('2) read the same entry again');
                const stream2 = await openReadStream(entry);
                await readEntry(stream2, entry.uncompressedSize);
                zipfile.readEntry();
            } catch (err) {
                reject(err);
            }
        });

        // Request the first entry only after every listener is attached, so
        // no 'entry' / 'end' / 'error' event can be missed.
        zipfile.readEntry();
    });
}()).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

/**
 * Consumes a read stream, logging memory usage and read progress.
 *
 * @param {object} readStream - Stream exposing `on()` and `destroy()`.
 * @param {number} entrySize - Uncompressed size passed to the progress handler.
 * @param {boolean} [justFirstChunk] - When truthy, the stream is destroyed as
 *     soon as the first 'data' event arrives and the promise resolves then.
 * @returns {Promise<void>} Resolves on 'end' (or after the early destroy);
 *     rejects with the stream's 'error'.
 */
async function readEntry(readStream, entrySize, justFirstChunk) {
    logMemoryUsage();
    const reportProgress = getProgressHandler(entrySize);

    await new Promise((resolve, reject) => {
        const handleChunk = (chunk) => {
            if (!justFirstChunk) {
                reportProgress(chunk);
                return;
            }
            // Only the first chunk is wanted: tear the stream down right away.
            readStream.destroy();
            console.log('readEntry destroyed');
            logMemoryUsage();
            resolve();
        };

        const handleEnd = () => {
            console.log('readEntry end');
            resolve();
        };

        readStream.on('error', reject);
        readStream.on('data', handleChunk);
        readStream.on('end', handleEnd);
    });
}

/**
 * Creates a 'data' handler that reports read progress in 10% steps.
 *
 * Each time the accumulated byte count reaches the next 10% boundary of
 * `entrySize`, the handler logs the progress as a fraction rounded to two
 * decimals (e.g. 0.3) and then logs the current memory usage.
 *
 * @param {number} entrySize - Total number of bytes expected.
 * @returns {function({length: number}): void} Handler to call with each chunk.
 */
function getProgressHandler(entrySize) {
    // Thresholds are tracked as whole percents. Accumulating 0.1 in floating
    // point drifts (0.2 + 0.1 becomes 0.30000000000000004), which made exact
    // boundaries such as 30% or 60% be skipped until the following percent.
    const stepPercent = 10;
    let bytes = 0;
    let nextPercent = stepPercent;
    return function (chunk) {
        bytes += chunk.length;
        const percent = Math.round(bytes / entrySize * 100);
        if (percent >= nextPercent) {
            console.log(percent / 100);
            logMemoryUsage();
            // Jump past the step just reached so a chunk that crosses several
            // boundaries at once does not trigger a log on every later chunk.
            nextPercent = (Math.floor(percent / stepPercent) + 1) * stepPercent;
        }
    };
}

/**
 * Prints the process's resident set size, rounded to whole megabytes.
 */
function logMemoryUsage() {
    const bytesPerMB = 1024 * 1024;
    const { rss } = process.memoryUsage();
    const memoryMB = Math.round(rss / bytesPerMB);
    console.log('memoryUsage:', memoryMB, 'MB');
}

Memory usage:

started                      
1) read the entry            
memoryUsage: 121 MB          
readEntry destroyed          
memoryUsage: 121 MB          
2) read the same entry again 
memoryUsage: 121 MB          
0.1                          
memoryUsage: 311 MB          
0.2                          
memoryUsage: 479 MB          
0.31                         
memoryUsage: 669 MB          
0.4                          
memoryUsage: 818 MB          
0.5                          
memoryUsage: 982 MB          
0.61                         
memoryUsage: 1157 MB         
0.71                         
memoryUsage: 1339 MB         
0.81                         
memoryUsage: 1506 MB         
0.91                         
memoryUsage: 1672 MB         
readEntry end                
finished                                              

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions