bagit/bagger.js

const { BagItFile } = require('./bagit_file');
const { Constants } = require('../core/constants');
const { Context } = require('../core/context');
const dateFormat = require('dateformat');
const EventEmitter = require('events');
const fs = require('fs');
const { KeyValueCollection } = require('./key_value_collection');
const mkdirp = require('mkdirp');
const { OperationResult } = require('../core/operation_result');
const os = require('os');
const path = require('path');
const { PluginManager } = require('../plugins/plugin_manager');
const { TagDefinition } = require('./tag_definition');
const { Util } = require('../core/util');

/**
 * Bagger creates a bag based on a BagItProfile.
 *
 * @param {Job} job - A job object that includes a
 * {@link PackageOperation} describing a number of files to be
 * packaged and a {@link BagItProfile} describing how to package them.
 *
 * Since bagging is basically a streaming operation, streaming data
 * into a specified format, this class implements a subset of the
 * Node.js stream events. The 'error' and 'finish' events are the
 * primary ones to listen to.
 *
 *
 * @example
 * // Assuming you have already created a Job object
 * var bagger = new Bagger(job);
 * bagger.on('error', function(err) {
 *    // Check the contents of job.packageOperation.result.errors
 *    // for details of what went wrong.
 * });
 * bagger.on('fileAdded', function(bagItFile, percentComplete) {
 *    // Do something with the BagItFile, such as displaying
 *    // a message saying it's been written into the bag.
 *    // Don't alter the bagItFile object since it's still
 *    // in use by the bagger. percentComplete is a number
 *    // between 0 and 100 indicating what percentage of the
 *    // total write job is complete.
 * });
 * bagger.on('finish', function() {
 *     // Do whatever you want when the bag is complete.
 *     // If needed, you can inspect the contents of the bagger.bagItFiles
 *     // array. Manifests and tag files in the bagItFiles array
 *     // will include the file contents. Payload files will not.
 * });
 * bagger.create();
 *
 */
class Bagger extends EventEmitter {
    constructor(job) {
        super();
        /**
         * The Job object contains information about what the bagger
         * is supposed to bag, and according to what profile.
         *
         * @type {Job}
         */
        this.job = job;
        /**
         * This is a list of absolute paths to temporary tag files and
         * manifests. These go into the system temp directory during
         * bagging, and the bagger deletes them when it's done.
         *
         * @type {Array<string>}
         */
        this.tmpFiles = [];
        /**
         * This is a list of {@link BagItFile} objects that were packed
         * into the bag. This includes payload files, manifests, tag files
         * and tag manifests.
         *
         * @type {BagItFile}
         */
        this.bagItFiles = [];
        /**
         * The formatWriter is a plugin used to write the bag onto disk.
         * For example, a bag being written into a directory on the file
         * system will use the FileSystemWriter plugin. A bag being written
         * to a tar file will use the TarWriter plugin, etc.
         *
         * The bagger chooses the formatWriter at runtime, based on
         * heuristics such as the file extension of the output file.
         *
         * @type {object}
         */
        this.formatWriter = null;

        // private
        this._pathToTrim = null;
    }

    /**
     * This ensures the packaging operation is valid before the bagger
     * tries to run it.
     *
     * @returns {boolean} - True or false, indicating whether or not
     * the job is valid.
     */
    validatePackagingOperation() {
        this.errors = {};
        var packOp = this.job.packageOp;
        if (!packOp.validate()) {
            packOp.result.errors.push("Job is not valid.");
            for(var [key, value] of Object.entries(this.job.errors)) {
                packOp.result.errors.push(`${key}: ${value}`);
            }
            packOp.result.finish();
            return false;
        }
        return true;
    }

    /**
     * This creates the bag based in the BagItProfile and other info specified
     * in the {@link Job} object. See the documentation for the {@link Bagger}
     * class for an example of how to use this method.
     *
     */
    async create() {
        var packOp = this.job.packageOp;
        this.emit('packageStart', `Starting to build ${packOp.packageName}`);
        packOp.result = new OperationResult('bagging', 'DART bagger');
        packOp.result.filepath = packOp.outputPath;
        packOp.result.start();

        if (!this.validatePackagingOperation()) {
            this._finish();
            this.emit('error', Context.y18n.__('Validation error in packaging operation.'));
            return false;
        }

        try {
            this._initWriter();
        } catch (ex) {
            packOp.result.errors.push(ex.toString());
            this._finish();
            this.emit('error', ex.toString());
            return false;
        }

        var bagger = this;
        /**
         * @event Bagger#error
         *
         * @description Emits a string describing an error encountered
         * during the bagging process. Processing may continue after
         * some types of errors.
         *
         * @type {string}
         */
        this.formatWriter.on('error', function(err) {
            packOp.result.errors.push(err);
            packOp.result.finish();
            bagger.emit('error', err);
        });
        /**
         * @event Bagger#fileAdded
         *
         * @description Emits a {@link BagItFile} object describing a
         * file that was just written into the bag.
         *
         * @type {BagItFile}
         */
        this.formatWriter.on('fileAdded', function(bagItFile, percentComplete) {
            bagger.emit('fileAdded', bagItFile, percentComplete);
        });

        await this._addPayloadFiles();

        if (!packOp.result.hasErrors()) {
            await this._addTagFiles();
        }
        if (!packOp.result.hasErrors()) {
            await this._addManifests();
        }
        if (!packOp.result.hasErrors()) {
            await this._addTagManifests();
        }

        bagger._finish();
    }

    /**
     * This adds payload files to the bag.
     */
    async _addPayloadFiles() {
        var packOp = this.job.packageOp;
        for (var absPath of packOp.sourceFiles) {
            var relDestPath = this._getRelDestPath(absPath);
            var stats = fs.statSync(absPath);
            if (stats.isFile()) {
                await this._addFile(absPath, relDestPath, stats);
            } else if (stats.isDirectory()) {
                // Wait until entire directory is added before
                // attaching finish listener, else queue will
                // drain more than once.
                this.formatWriter.directories[relDestPath] = stats;
                await this._addDirectory(absPath, relDestPath, stats);
            }
        }
        var bagger = this;
        return new Promise( function(resolve, reject) {
            bagger.formatWriter.once('finish', function() {
                resolve();
            });
        });
    }

    /**
     * Adds a tag file to the bag, writing out all of the tag
     * name-value pairs.
     *
     * @private
     */
    async _addTagFiles(bagItFiles) {
        this._setBagInfoAutoValues();
        var profile = this.job.bagItProfile;
        for (let tagFileName of profile.tagFileNames()) {
            let content = profile.getTagFileContents(tagFileName);
            let tmpFile = path.join(os.tmpdir(), tagFileName + Date.now());
            this.tmpFiles.push(tmpFile);
            if (!fs.existsSync(path.dirname(tmpFile))) {
                mkdirp.sync(path.dirname(tmpFile), { mode: 0o755 });
            }
            fs.writeFileSync(tmpFile, content);
            var stats = fs.statSync(tmpFile);
            await this._addFile(tmpFile, tagFileName, stats);
        }
        let bagger = this;
        return new Promise(function(resolve, reject) {
            bagger.formatWriter.once('finish', function() {
                resolve();
            });
        });
    }

    /**
     * Adds payload manifests to the bag.
     *
     * @private
     */
    async _addManifests() {
        let bagger = this;
        let promise = new Promise(function(resolve, reject) {
            bagger.formatWriter.once('finish', function() {
                resolve();
            });
        });
        await this._writeManifests('payload');
        return promise;
    }

    /**
     * Adds tag manifests to the bag.
     *
     * @private
     */
    async _addTagManifests() {
        let bagger = this;
        let promise = new Promise(function(resolve, reject) {
            bagger.formatWriter.once('finish', function() {
                resolve();
            });
        });
        await bagger._writeManifests('tag');
        return promise;
    }


    /**
     * Adds an entire directory to the bag's payload.
     *
     * @private
     */
    _addDirectory(absPath, relDestPath, stats) {
        let bagger = this;
        let packOp = this.job.packageOp;
        let fsReaderClass = PluginManager.findById(Constants.FILESYSTEM_READER_UUID);
        let fsReader = new fsReaderClass(absPath);
        fsReader.on('entry', function(entry) {
            let fullPath = path.join(absPath, entry.relPath);
            let relDestPath = bagger._getRelDestPath(fullPath);
            if (entry.fileStat.isFile()) {
                bagger._addFile(fullPath, relDestPath, entry.fileStat);
            } else if (entry.fileStat.isDirectory()) {
                bagger.formatWriter.directories[relDestPath] = entry.fileStat;
            }
        });
        fsReader.on('error', function(err) {
            packOp.result.errors.push(err.toString());
            bagger.emit('error', err);
        });
        fsReader.list();
        return new Promise(function(resolve, reject) {
            fsReader.on('end', function(fileCount) {
                resolve(fileCount);
            });
        });
    }

    /**
     * Adds a single file to the bag's payload.
     *
     * @private
     */
    _addFile(absPath, relDestPath, stats) {
        if (os.platform() === 'win32' && this.formatWriter.constructor.name === 'TarWriter') {
            relDestPath = relDestPath.replace(/\\/g, '/');
        }
        var profile = this.job.bagItProfile;
        let bagItFile = new BagItFile(absPath, relDestPath, stats);

        let manifestAlgs = profile.chooseManifestAlgorithms('manifest');
        if (!relDestPath.startsWith('data/')) {
            // This is a tag file, not a payload file.
            manifestAlgs = profile.chooseManifestAlgorithms('tagmanifest');
        }
        let cryptoHashes = this._getCryptoHashes(bagItFile, manifestAlgs);
        this.formatWriter.add(bagItFile, cryptoHashes);
        this.bagItFiles.push(bagItFile);
        return new Promise(function(resolve) {
            resolve(bagItFile);
        });
    }

    /**
     * This chooses the plugin that will be used when writing the bag
     * to disk. Tarred bags will use the TarWriter plugin, unserialized
     * bags will use the FileSystemWriter plugin, etc.
     *
     * @private
     */
    _initWriter() {
        if (this.formatWriter) {
            // Don't create another because it will overwrite our output file.
            return;
        }
        var outputPath = this.job.packageOp.outputPath;
        var parentDir = path.dirname(outputPath);
        let fileExtension = path.extname(outputPath);
        if (fileExtension === '') {
            fileExtension = 'directory';
            parentDir = outputPath;
        }
        if (!fs.existsSync(outputPath)) {
            mkdirp.sync(parentDir, { mode: 0o755 });
        }
        var plugins = PluginManager.canWrite(fileExtension);
        if (!plugins || plugins.length == 0) {
            throw Context.y18n.__("DART cannot find a plugin that knows how to write %s files.", fileExtension);
        }
        // plugins[0] is a writer plugin (a class) with a constructor
        // that takes pathToBag as its sole param.
        this.formatWriter = new plugins[0](outputPath);
        this.formatWriter.init();
    }

    /**
     * Given a file's path on disk this returns the relative path that
     * file will occupy inside the bag.
     *
     * @param {string} absPath - The path of the source file on disk.
     *
     * @returns {string} - The path the file will occupy inside the bag.
     *
     * @private
     */
    _getRelDestPath(absPath) {
        var trimmedPath = this._trimAbsPath(absPath);
        var relDestPath = 'data' + trimmedPath;
        if (os.platform() == 'win32') {
            relDestPath = 'data' + Util.normalizeWindowsPath(trimmedPath);
        }
        return relDestPath;
    }

    /**
     * Trim common path prefixes. We call this before calculating the
     * relDestPath. See {@link Util.findCommonPathPrefix} and
     * {@link PackageOperation.trimLeadingPaths}.
     *
     * @private
     */
    _trimAbsPath(absPath) {
        let trimPath = this._getTrimPath();
        let trimmed = absPath;
        if (trimPath) {
            if (os.platform() == 'win32') {
                trimPath = trimPath.replace(/\\/g, '\\\\');
            }
            let pattern = new RegExp('^' + trimPath);
            trimmed = path.sep + absPath.replace(pattern, '');
        }
        return trimmed;
    }

    /**
     * Returns the common leading path that we can trim from source
     * files before bagging. See {@link Util.findCommonPathPrefix} and
     * {@link PackageOperation.trimLeadingPaths}.
     *
     * @private
     */
    _getTrimPath() {
        if (this._pathToTrim === null) {
            if (this.job.packageOp.trimLeadingPaths()) {
                this._pathToTrim = Util.findCommonPathPrefix(this.job.packageOp.sourceFiles);
            } else {
                this._pathToTrim = '';
            }
        }
        return this._pathToTrim;
    }

    /**
     * Sets some automatic values in the bag-info.txt file, including
     * Bagging-Date, Bagging-Software, Payload-Oxum and
     * BagItProfileIdentifier
     *
     * @private
     */
    _setBagInfoAutoValues() {
        var profile = this.job.bagItProfile;
        var baggingDate = profile.firstMatchingTag('tagName', 'Bagging-Date');
        if (baggingDate) {
            baggingDate.userValue = dateFormat(Date.now(), 'isoUtcDateTime');
        }

        var baggingSoftware = profile.firstMatchingTag('tagName', 'Bagging-Software');
        if (baggingSoftware) {
            baggingSoftware.userValue = Context.dartVersion();
        }

        // This is an odd one, because the bagit-profiles spec at
        // https://bagit-profiles.github.io/bagit-profiles-specification/
        // says it's required, and yet the example specs do not require
        // or even this tag.
        var profileIdentifier = profile.firstMatchingTag('tagName', 'BagIt-Profile-Identifier')
        if (profileIdentifier == null) {
            profileIdentifier = new TagDefinition({
                id: Util.uuid4(),
                tagFile: "bag-info.txt",
                tagName: "BagIt-Profile-Identifier",
                required: true
            });
            profile.tags.push(profileIdentifier)
        }
        profileIdentifier.userValue = profile.bagItProfileInfo.bagItProfileIdentifier || 'https://example.com/profile.json';

        var fileCount = 0;
        var byteCount = 0;
        var payloadOxum = profile.firstMatchingTag('tagName', 'Payload-Oxum');
        if (payloadOxum) {
            for (let f of this.bagItFiles) {
                if (f.isPayloadFile()) {
                    fileCount += 1;
                    byteCount += Number(f.size);
                }
            }
            payloadOxum.userValue = `${byteCount}.${fileCount}`;
        }

        var bagSize = profile.firstMatchingTag('tagName', 'Bag-Size');
        if (bagSize && payloadOxum) {
            bagSize.userValue = Util.toHumanSize(byteCount);
        }
    }

    /**
     * Deletes temporary manifest and tag files that were generated during
     * the bagging process.
     *
     * @private
     */
    _deleteTempFiles() {
        for (let f of this.tmpFiles) {
            if (fs.existsSync(f)) {
                fs.unlinkSync(f);
            }
        }
    }

    /**
     * Records results of the bagging operation, cleans up temp files,
     * and emits the 'finish' event.
     *
     * @private
     */
    _finish() {
        var result = this.job.packageOp.result;
        result.finish();
        if (fs.existsSync(result.filepath)) {
            let stat = fs.statSync(result.filepath);
            if (stat.isDirectory()) {
                const sum = (total, file) => total + Number(file.size);
                result.filesize = this.bagItFiles.reduce(sum, 0);
            } else {
                result.filesize = Number(stat.size);
            }
        }
        this._deleteTempFiles();
        /**
         * @event Bagger#finish
         *
         * @description Emits an empty event indicating the bagger has
         * completed its work. Check bagger.job.packageOp.result
         * for errors.
         *
         */
        this.emit('finish');
    }


    /**
     * Adds manifests of the specified type to the bag.
     *
     * @param {string} payloadOrTag - Describes whether to add payload
     * or tag manifests.
     *
     * @private
     */
    async _writeManifests(payloadOrTag) {
        var profile = this.job.bagItProfile;
        var manifestAlgs = profile.chooseManifestAlgorithms('manifest');
        var fileNamePrefix = 'manifest';
        if (payloadOrTag == 'tag') {
            manifestAlgs = profile.chooseManifestAlgorithms('tagmanifest');
            fileNamePrefix = 'tagmanifest';
        }
        if (manifestAlgs.length == 0) {
            this.formatWriter.emit('finish');
            return;
        }
        for (let algorithm of manifestAlgs) {
            var manifestName = `${fileNamePrefix}-${algorithm}.txt`;
            let tmpFile = path.join(os.tmpdir(), manifestName + Date.now());
            this.tmpFiles.push(tmpFile);
            var fd = fs.openSync(tmpFile, 'w')
            for (let bagItFile of this.bagItFiles) {
                if (payloadOrTag === 'payload' && !bagItFile.isPayloadFile()) {
                    continue;
                }
                if (payloadOrTag === 'tag' && (bagItFile.isPayloadFile() || bagItFile.isTagManifest())) {
                    continue;
                }
                let digest = bagItFile.checksums[algorithm];
                fs.writeSync(fd, `${digest} ${bagItFile.relDestPath}\n`);
            }
            fs.closeSync(fd);
            var stats = fs.statSync(tmpFile);
            await this._addFile(tmpFile, manifestName, stats);
        }
    }

    /**
     * Returns a list of cryptographic hash objects that must be calculated
     * on a file as it's written into the bag. DART can calculate multiple
     * digests on a file during a single write.
     *
     * @param {BagItFile} bagItFile - The BagItFile on which to calculate
     * the hash digest.
     *
     * @param {Array<string>} algorithms - The names of the algorithms to
     * calculate. For example, ['md5', 'sha256', 'sha512']. This info comes
     * from the manifestsRequired and tagManifestsRequired properties of the
     * BagItProfile.
     *
     * @returns {Array<object>} - An array of Node.js crypto.Hash objects.
     *
     * @private
     */
    _getCryptoHashes(bagItFile, algorithms) {
        let hashes = [];
        for (let algorithm of algorithms) {
            hashes.push(bagItFile.getCryptoHash(algorithm));
        }
        return hashes;
    }
}

module.exports.Bagger = Bagger;