plugins/formats/read/tar_reader.js

const fs = require('fs');
const EventEmitter = require('events');
const { FileStat } = require('../../../util/file/filestat');
const { PassThrough } = require('stream');
const { Plugin } = require('../../plugin');
const tar = require('tar-stream');

/**
  * TarReader provides methods for listing and reading the contents
  * of tar files. This is used by the bag validator to validate tarred
  * bags without having to untar them first.
  *
  * Both TarReader and {@link FileSystemReader} implement a common
  * interface and emit a common set of events to provide the bag
  * validator with a uniform interface for reading bags packaged in
  * different formats.
  *
  * See the list() and read() functions below for information about
  * the events they emit.
 */
class TarReader extends Plugin {

    /**
      * Creates a new TarReader.
      *
      * @param {string} pathToTarFile - This should be the absolute
      * path to the tar file you want to read.
     */
    constructor(pathToTarFile) {
        super();
        /**
         * pathToTarFile is the absolute path to the tar file that
         * this iterator will read.
         *
         * @type {string}
         */
        this.pathToTarFile = pathToTarFile;
        /**
         * fileCount is the number of files encountered during a read()
         * or list() operation.
         *
         * @type {number}
         */
        this.fileCount = 0;
        /**
         * dirCount is the number of directories encountered during a
         * read() or list() operation.
         *
         * @type {number}
         */
        this.dirCount = 0;
        /**
         * byteCount keeps track of the total number of bytes in all
         * files beneath the speficied directory. This value is valid
         * only during list operations, not during read().
         *
         * @type {number}
         */
        this.byteCount = 0;
    }

    /**
     * Returns a {@link PluginDefinition} object describing this plugin.
     *
     * @returns {PluginDefinition}
     */
    static description() {
        return {
            id: 'd1045d20-153b-478c-aca4-7192c4ce624f',
            name: 'TarReader',
            description: 'Built-in DART tar reader',
            version: '0.1',
            readsFormats: ['.tar'],
            writesFormats: [],
            implementsProtocols: [],
            talksToRepository: [],
            setsUp: []
        };
    }

    /**
      * The read() method reads the contents of the tar file.
      * It emits the events "entry", "error" and "end".
      *
      * The read() method returns the same information as the
      * list() method, plus a readable stream from which you can
      * extract the contents of individual tar files.
      *
      * Note that read() will not advance to the next entry
      * until you've read the entire stream returned returned by
      * the "entry" event.
      *
      */
    read() {
        var tarReader = this;
        var extract = tar.extract();
        tarReader.fileCount = 0;
        tarReader.dirCount = 0;
        tarReader.byteCount = 0;

        extract.on('entry', function(header, stream, next) {

            var fileStat = tarReader._headerToFileStat(header);

            // relPath is the relative path of the file within
            // the tar file.
            var relPath = header.name;

            /**
             * @event TarReader#entry
             *
             * @description The entry event of the read() method includes both info
             * about the tar file (from the header) and a {@link ReadStream} object
             * that allows you to read the contents of the entry, if it's a file.
             * Note that you MUST read the stream to the end before TarReader.read()
             * will move to the next tar entry.
             *
             * @type {object}
             *
             * @property {string} relPath - The relative path (within the tar file)
             * of the entry.
             *
             * @property {ReadStream} stream - A stream from which you can read the
             * contents of the entry.
             *
             * @property {FileStat} fileStat - An object containing a subset info similar
             * to the fs.Stats object, describing the file's size and other attributes.
             */
            tarReader.emit('entry', { relPath: relPath, fileStat: fileStat, stream: stream });

            // When we reach the end of the read stream, tell the
            // tar-stream library to move on to the next entry.
            stream.on('end', function() {
                if (header.type === "file") {
                    tarReader.fileCount += 1;
                    tarReader.byteCount += Number(fileStat.size);
                } else if (header.type === "directory") {
                    tarReader.dirCount += 1;
                }
                next() // ready for next entry
            });
        });

        /**
         * @event TarReader#error
         *
         * @description Indicates something went wrong while reading the tarfile.
         *
         * @type {Error}
         */
        extract.on('error', function(err) {
            tarReader.emit('error', err);
        });

        /**
         * @event TarReader#end
         *
         * @description This indicates that the iterator has passed
         * the last entry in the tar file and there's nothing left to read.
         */
        extract.on('finish', function() {
            tarReader.emit('end', tarReader.fileCount + tarReader.dirCount);
        });

        // Open the tar file and start reading.
        fs.createReadStream(tarReader.pathToTarFile).pipe(extract)
    }

    /**
      * The list() method returns information about the contents
      * files and directories inside a tar file. Unlike read(), it does
      * not return a readable stream for any of the files it encounters.
      *
      * list() emits the events "entry", "error" and "end".
      *
      */
    list() {
        var tarReader = this;
        var extract = tar.extract();
        tarReader.fileCount = 0;
        tarReader.dirCount = 0;


        /**
         * @event TarReader#entry
         *
         * @description The entry event of the list() method returns info about
         * the tar entry, but no reader to read the contents of the entry.
         *
         * @type {object}
         *
         * @property {string} relPath - The relative path (within the tar file)
         * of the entry.
         *
         * @property {FileStat} fileStat - An object containing a subset info similar
         * to the fs.Stats object, describing the file's size and other attributes.
         *
         */
        extract.on('entry', function(header, stream, next) {
            var fileStat = tarReader._headerToFileStat(header);
            var relPath = header.name;
            tarReader.emit('entry', { relPath: relPath, fileStat: fileStat });
            stream.on('end', function() {
                if (header.type === "file") {
                    tarReader.fileCount += 1;
                    tarReader.byteCount += Number(fileStat.size);
                } else if (header.type === "directory") {
                    tarReader.dirCount += 1;
                }
                next();
            });

            // This works around
            // https://github.com/mafintosh/tar-stream/issues/64
            let pt = new PassThrough();
            stream.pipe(pt);
            pt.resume();
        });


        // Same as the error event documented above.
        extract.on('error', function(err) {
            tarReader.emit('error', err);
        });

        // Same as the end event documented above.
        extract.on('finish', function() {
            tarReader.emit('end', tarReader.fileCount + tarReader.dirCount);
        });

        // Open the tar file and start reading.
        fs.createReadStream(tarReader.pathToTarFile).pipe(extract)
    }

    _headerToFileStat(header) {
        return new FileStat({
            size: header.size,
            mode: header.mode,
            uid: header.uid,
            gid: header.gid,
            mtimeMs: header.mtime,
            type: header.type
        });
    }
}

module.exports = TarReader;