plugins/formats/write/tar_writer.js

const { BaseWriter } = require('./base_writer');
const { Context } = require('../../../core/context');
const fs = require('fs');
const mkdirp = require('mkdirp');
const path = require('path');
const tar = require('tar-stream');

/**
 * TarWriter writes files directly into a tarball. By obviating the need
 * to copy files into a directory and then tar up the directory, this can
 * save considerable time and disk space when tarring large bags into a
 * tarred BagIt package.
 *
 */
class TarWriter extends BaseWriter {
    /**
     * Creates a new TarWriter.
     *
     * @param {string} pathToTarFile - The path to the tar file you want to
     * create. It must end with ".tar" and you must have write permissions
     * on it. A missing parent directory is created when the first entry is
     * added. If a file already exists at the specified path, it will be
     * overwritten.
     *
     */
    constructor(pathToTarFile) {
        super('TarWriter', writeIntoArchive);
        /**
          * pathToTarFile is the path to the file we will write.
          * You must have write permissions on the file, and its parent
          * directory is created on first write if it does not exist. If
          * the file already exists, it will be overwritten.
          *
          * @type {string}
          */
        this.pathToTarFile = pathToTarFile;
        /**
          * bagName is the name of the bag, which is the tar file name
          * minus the leading path and trailing ".tar" extension.
          *
          * @type {string}
          */
        this.bagName = path.basename(pathToTarFile, '.tar');
        /**
          * This is a special stream for serializing data into tar file format.
          * It is created lazily by {@link _getTarPacker}.
          *
          * @type {stream.Writable}
          * @private
          */
        this._tarPacker = null;
        /**
          * The stream used to write tar file contents onto disk.
          * It is created lazily by {@link _getTarOutputWriter}.
          *
          * @type {stream.Writable}
          * @private
          */
        this._tarOutputWriter = null;
        /**
         * This keeps track of which directory entries have been written into
         * the tarball. Entries are full tar paths with a trailing slash,
         * e.g. "MyBag/data/".
         *
         * @type {Set}
         * @private
         */
        this._directoriesAdded = new Set();
    }

    /**
     * Returns a {@link PluginDefinition} object describing this plugin.
     *
     * @returns {PluginDefinition}
     */
    static description() {
        return {
            id: '90110710-1ff9-4650-a086-d7b23772238f',
            name: 'TarWriter',
            description: 'Built-in DART tar writer. Writes files directory into a tarball.',
            version: '0.1',
            readsFormats: [],
            writesFormats: ['.tar'],
            implementsProtocols: [],
            talksToRepository: [],
            setsUp: []
        };
    }

    /**
     * This ensures that the necessary directory entries are created
     * for each file added to the tarball. This is the equivalent of
     * mkdirp in the filesystem.
     *
     * @param {Object} fileHeader - The tar header of the file about to be
     * added. Only its name, uid and gid are read here.
     * @private
     */
    _ensureDirectories(fileHeader) {
        // Fallback attributes for directories that have no cached entry.
        const template = {
            relDestPath: '',
            mode: 0o755,
            uid: fileHeader.uid,
            gid: fileHeader.gid,
            mtime: null
        };
        const parts = fileHeader.name.split('/');
        let dir = '';
        // The last part is the file name itself, so stop one short of it.
        for (let i = 0; i < parts.length - 1; i++) {
            dir = dir + parts[i] + '/';
            if (this._directoriesAdded.has(dir)) {
                continue;
            }
            // Cached directory stats are looked up in this.directories.
            // The key is the directory path minus the bag name, with no
            // leading or trailing slash. E.g. the key for
            // "MyBag/data/subdir/" is "data/subdir", and the key for the
            // bag's root directory "MyBag/" is the empty string.
            // this.directories is not defined in this class; presumably
            // BaseWriter provides it -- TODO confirm.
            let key = "";
            if (i > 0) {
                key = dir.split("/").slice(1).join("/").replace(/\/$/, "");
            }
            let data = this.directories[key];
            if (!data) {
                data = Object.assign({}, template);
                data.mtime = new Date();
            }
            // NOTE(review): when data comes from this.directories, the
            // cached entry itself is modified here. Confirm nothing else
            // depends on its previous relDestPath.
            data.relDestPath = dir;
            this._mkdir(data);
        }
    }

    /**
     * Writes a file into the tar archive. This method is asynchronous, emitting
     * events 'fileAdded' when it's done writing a file.
     *
     * Files will be written into the archive in the order they are added.
     * Because tar file contents must be written one at a time, this class
     * internally manages one-at-a-time write serialization.
     *
     * You'll get errors if bagItFile.absSourcePath does not exist or is not
     * readable.
     *
     * If the tar output cannot be opened, the error is handed to the
     * queue's error function and nothing is queued for this file.
     *
     * @param {BagItFile} bagItFile - The BagItFile object describing the file
     * to be added into the tar file.
     *
     * @param {Array<crypto.Hash>} cryptoHashes - An array of Node.js crypto.Hash
     * objects used to calculate checksums of the files being written into
     * the tarball. All digests are calculated during the write, so adding
     * multiple hashes will not lead to multiple end-to-end reads of the
     * input stream.
     *
     * You can omit this parameter if you don't care to calculate
     * checksums. If present, the digests will be written into the
     * bagItFile.checksums object. For example, if cryptoHashes includes md5
     * and sha256 Hash objects, bagItFile.checksums will come out looking
     * like this:
     *
     * @example
     * bagItFile.checksums = {
     *     'md5': '1234567890',
     *     'sha256': '0987654321'
     * }
     *
     */
    add(bagItFile, cryptoHashes = []) {
        super.add(bagItFile, cryptoHashes);
        const header = {
            // Don't use path.join because Windows will give us
            // backslashes and tar file needs forward slashes.
            name: this.bagName + '/' + bagItFile.relDestPath,
            size: Number(bagItFile.size),
            mode: bagItFile.mode,
            uid: bagItFile.uid,
            gid: bagItFile.gid,
            mtime: bagItFile.mtime
        };
        // pax headers allow us to include files over 8GB in size
        header.pax = {
            size: Number(bagItFile.size)
        };

        // Obtain the packer before queueing anything. If the output file
        // cannot be opened, the failure is reported once here rather than
        // once per parent directory and then again for the file.
        let packer = null;
        try {
            packer = this._getTarPacker();
        } catch (err) {
            this._queue.error(err);
            return;
        }

        // Parent directory entries must precede the file in the archive.
        this._ensureDirectories(header);

        /**
         * @event TarWriter#fileAdded - This event fires after a file
         * has been written into the underlying tar file.
         *
         * @type {BagItFile}
         *
         */
        const data = {
            bagItFile: bagItFile,
            header: header,
            tar: packer,
            hashes: cryptoHashes,
            endFn: () => {
                this.onFileWritten();
                this.emit('fileAdded', bagItFile, this.percentComplete());
            },
            errFn: (err) => {
                this.emit('error', err);
            }
        };
        this._queue.push(data);
    }

    /**
     * This adds a directory entry to the tar file. It does not add any
     * contents to the directory. Use {@link add} for that.
     *
     * @param {BagItFile} bagItFile - Describes the directory. Only its
     * relDestPath, mode, uid, gid and mtime are read. A trailing slash is
     * appended to the entry name if relDestPath lacks one.
     * @private
     */
    _mkdir(bagItFile) {
        const header = {
            name: bagItFile.relDestPath,
            type: 'directory',
            mode: Number(bagItFile.mode),
            uid: Number(bagItFile.uid),
            gid: Number(bagItFile.gid),
            mtime: bagItFile.mtime
        };

        if (!header.name.match(/\/$/)) {
            header.name += '/';
        }

        /**
         * @event TarWriter#directoryAdded - This event fires after a
         * directory entry has been written into the underlying tar file.
         *
         * @type {BagItFile}
         *
         */
        let packer = null;
        try {
            packer = this._getTarPacker();
        } catch (err) {
            this._queue.error(err);
            return;
        }
        const data = {
            bagItFile: bagItFile,
            header: header,
            tar: packer,
            hashes: [],
            endFn: () => {
                this.emit('directoryAdded', bagItFile);
            },
            errFn: (err) => {
                this.emit('error', err);
            }
        };
        this._queue.push(data);
        // Remember the entry so _ensureDirectories does not queue it twice.
        this._directoriesAdded.add(header.name);
    }


    /**
     * This returns the tar-stream packer object, creating it if it
     * doesn't already exist. The tar-stream packer transforms data to
     * tar format before the output writer writes it to disk.
     *
     * The packer is cached only after it has been piped to the output
     * writer. If the output writer cannot be created, this throws on every
     * call instead of handing out a packer that is connected to nothing.
     *
     * @returns {stream.Writable}
     * @throws {Error} Whatever {@link _getTarOutputWriter} throws.
     * @private
     */
    _getTarPacker() {
        if (this._tarPacker == null) {
            // This may throw, so do it before creating or caching the packer.
            const output = this._getTarOutputWriter();
            const packer = tar.pack();
            packer.pipe(output);
            this._tarPacker = packer;
        }
        return this._tarPacker;
    }

    /**
     * This returns the stream that allows us to write our tar file to the
     * file system, creating the stream if it doesn't already exist. The
     * tar file's parent directory is created if it is missing.
     *
     * @returns {stream.Writable}
     * @throws {Error} If pathToTarFile does not end with ".tar", if the
     * parent path exists but is not a directory, if the parent directory
     * is not writable, or if it cannot be created.
     * @private
     */
    _getTarOutputWriter() {
        if (this._tarOutputWriter == null) {
            if (!this.pathToTarFile.endsWith(".tar")) {
                throw new Error(Context.y18n.__(`pathToTarFile '%s' must have .tar extension`, this.pathToTarFile));
            }
            const dir = path.dirname(this.pathToTarFile);
            if (!fs.existsSync(dir)) {
                // Check existence first: fs.statSync throws ENOENT on a
                // missing path, which would make this branch unreachable.
                mkdirp.sync(dir, { mode: 0o755 });
            } else {
                if (!fs.statSync(dir).isDirectory()) {
                    // This one came up while writing unit tests.
                    // If the path includes /dev/null or other character
                    // devices, the access check below will not handle the
                    // problem correctly.
                    throw new Error(Context.y18n.__("Cannot write to output path '%s' because it is not a directory.", dir));
                }
                // This throws an exception if the user can't write to dir.
                fs.accessSync(dir, fs.constants.W_OK);
            }

            const options = {
                mode: 0o644,
                autoClose: false
            };
            this._tarOutputWriter = fs.createWriteStream(this.pathToTarFile, options);
        }
        return this._tarOutputWriter;
    }
}

/**
 * Worker function for the TarWriter's one-at-a-time async queue.
 * It looks at the tar header of the queued item and hands the item to
 * writeDirectory when the header type is 'directory', or to writeFile
 * for everything else.
 *
 * @param {Object} data - An object containing information about what is
 * to be written into the archive. Only data.header.type is inspected here.
 *
 * @param {function} done - A callback that indicates when the writer has
 * completed. The async library creates and manages this function.
 *
 * @private
 */
function writeIntoArchive(data, done) {
    const isDirectoryEntry = data.header.type === 'directory';
    const worker = isDirectoryEntry ? writeDirectory : writeFile;
    worker(data, done);
}

/**
 * Streams one file from disk into the tar archive, feeding the same bytes
 * to every hash in data.hashes along the way.
 *
 * The queue callback is invoked exactly once, whether the entry is written
 * successfully, the source file cannot be read, the tar entry fails, or
 * setup throws. data.errFn is invoked at most once per call.
 *
 * @param {Object} data - The queued work item.
 * @param {Object} data.bagItFile - Its absSourcePath is the file to read.
 * @param {Object} data.header - The tar header passed to data.tar.entry().
 * @param {Object} data.tar - The tar packer; entry(header, callback) must
 * return a writable stream.
 * @param {Array} data.hashes - Writable hash streams to pipe the file into.
 * @param {function} data.endFn - Called when the tar entry emits 'finish'.
 * @param {function} data.errFn - Called with the error when anything fails.
 *
 * @param {function} done - Queue callback. Called with no arguments on
 * success, or with (err, data) on failure.
 *
 * @private
 */
function writeFile(data, done) {
    let doneCalled = false;
    let errorReported = false;

    // Releases the queue worker. Guarded because several streams may
    // report the end of the same entry.
    const finish = (err) => {
        if (doneCalled) {
            return;
        }
        doneCalled = true;
        if (err) {
            done(err, data);
        } else {
            done();
        }
    };

    // Reports a failure once, then releases the queue worker.
    const fail = (err) => {
        if (!errorReported) {
            errorReported = true;
            data.errFn(err);
        }
        finish(err);
    };

    let reader = null;
    let writer = null;
    try {
        reader = fs.createReadStream(data.bagItFile.absSourcePath);

        // Attach this before creating the tar entry, so a late read error
        // is still handled if entry() throws.
        reader.on('error', (err) => {
            fail(err);
            // pipe() does not close the destination when the source fails,
            // so the half-written tar entry has to be released by hand.
            if (writer && typeof writer.destroy === 'function') {
                writer.destroy(err);
            }
        });

        // For testing dashboard process management, slow down writes
        const delay = Context.slowMotionDelay;
        writer = data.tar.entry(data.header, (err) => {
            if (err) {
                fail(err);
            } else if (delay > 0) {
                setTimeout(finish, delay);
            } else {
                finish();
            }
        });

        writer.on('error', fail);
        writer.on('finish', data.endFn);

        // Hold the stream until every consumer is attached, so the hashes
        // and the tar entry all see the file from its first byte.
        reader.pause();
        for (const hash of data.hashes) {
            reader.pipe(hash);
        }
        reader.pipe(writer);
        reader.resume();
    } catch (err) {
        Context.logger.error(err);
        Context.logger.error(err.stack);
        // Nothing will consume the read stream now; don't leave it open.
        if (reader) {
            reader.destroy();
        }
        fail(err);
    }
}

/**
 * Adds a directory entry to the tar archive. No content is written for the
 * entry: the header is passed to the packer along with the queue callback,
 * and data.endFn is registered for the entry's 'finish' event.
 *
 * If creating the entry throws, the error is logged, passed to data.errFn,
 * and then passed to the queue callback together with data.
 *
 * @param {Object} data - The queued work item; header, tar, endFn and
 * errFn are read.
 *
 * @param {function} done - Queue callback, passed to the packer as the
 * entry callback.
 *
 * @private
 */
function writeDirectory(data, done) {
    try {
        data.tar
            .entry(data.header, done)
            .on('finish', data.endFn);
    } catch (failure) {
        Context.logger.error(failure);
        data.errFn(failure);
        done(failure, data);
    }
}

module.exports = TarWriter;