// bagit/manifest_parser.js

const { Context } = require('../core/context');
const { KeyValueCollection } = require('./key_value_collection');
const { PassThrough } = require('stream');

// Fields on a manifest line are separated by one or more whitespace characters.
const spaces = /\s+/;
// Manifest entries are separated by line feeds. A carriage return left over
// from CRLF line endings is removed when the file name is trimmed.
const newline = "\n";

/**
 * ManifestParser parses text-based payload and tag manifests that
 * conform to the BagIt spec. These files have the following format:
 *
 * 924238a7018a567062c6527675fcf1c0  data/docs/index.html
 *
 * The first item on each line is a checksum and the second is the
 * relative path (within the bag) of the file it was calculated from.
 * The two items are separated by one or more spaces. The path itself
 * may contain spaces.
 *
 * This class has no methods. It simply responds to events on the stream
 * you pipe into it. After parsing the stream, it stores the data it
 * has parsed in bagItFile.keyValueCollection. Within that collection,
 * you can use the first() method to look up the checksum for a file.
 *
 * Lines may arrive split across any number of stream chunks; the parser
 * buffers the incomplete tail of each chunk until the rest of the line
 * arrives. A final line that is not terminated by a newline is parsed
 * when the stream emits 'end'.
 *
 * You can attach your own callback to the ManifestParser.stream end
 * event, if you want to do something with the BagItFile or (more likely)
 * its keyValueCollection when parsing completes. The parser registers
 * its own 'end' handler in the constructor, so it runs before any 'end'
 * callback you attach afterwards.
 *
 * @param {BagItFile} bagItFile - A BagItFile object. If the object
 * does not already have a keyValueCollection, the parser will create
 * one.
 *
 * For more on the BagIt spec,
 * see {@link https://tools.ietf.org/html/draft-kunze-bagit-17|BagIt Spec}
 *
 * For info about how to read the parsed data from the file, see {@link KeyValueCollection}
 *
 * @example
 *
 * // Set up a BagItFile. Note the file itself is a payload manifest.
 * let pathToManifest = "/path/to/manifest-sha256.txt";
 * let stats = fs.statSync(pathToManifest);
 * let bagItFile = new BagItFile(pathToManifest, "manifest-sha256.txt", stats);
 *
 * // Open the BagItFile for reading
 * let stream = fs.createReadStream(pathToManifest);
 *
 * // Create a new ManifestParser to parse the BagItFile
 * let manifestParser = new ManifestParser(bagItFile);
 *
 * // Optional: Hook up your callback to do something with
 * // bagItFile when the parsing is done.
 * manifestParser.stream.on('end', YOUR_CALLBACK_HERE);
 *
 * // Required: Pipe your file reader into the parser.
 * stream.pipe(manifestParser.stream).on('error', function(e){handleError(e)});
 *
 */
class ManifestParser {
    // Creates the PassThrough stream and attaches the 'data' and 'end'
    // handlers that populate bagItFile.keyValueCollection.
    constructor(bagItFile) {
        /**
          * bagItFile is the file that will be parsed.
          * When parsing is complete, bagItFile.keyValueCollection
          * will be populated with relative file paths and
          * the checksum digests that correspond to those paths.
          *
          * @type {BagItFile}
          */
        this.bagItFile = bagItFile;

        /**
          * stream is a PassThrough stream that allows
          * for data to be piped from a ReadStream into
          * the parser. You can attach your own 'data' and
          * 'end' events to this stream, but the parser
          * already does the parsing work for you.
          *
          * @type {stream.PassThrough}
          */
        this.stream = new PassThrough();
        // Deliver chunks to the 'data' handler as strings rather than Buffers.
        this.stream.setEncoding('utf8');

        /**
         * lastFragment holds the text received since the last
         * newline, i.e. the beginning of a line whose end has
         * not arrived yet. It is empty when the most recent
         * chunk ended exactly on a line boundary.
         *
         * @private
         * @type {string}
         */
        this.lastFragment = '';

        if (bagItFile.keyValueCollection == null) {
            bagItFile.keyValueCollection = new KeyValueCollection();
        }

        // Parses one complete manifest line and records it in the
        // collection. Blank lines and lines that lack either a checksum
        // or a file name are ignored.
        const addEntry = (line) => {
            // First item on line is the fixity value.
            const fixityValue = line.split(spaces, 1)[0];
            // Everything after the fixity value is the file name, which
            // may itself contain spaces. trim() also drops a trailing "\r".
            const filename = line.slice(fixityValue.length).trim();
            if (filename !== '' && fixityValue !== '') {
                this.bagItFile.keyValueCollection.add(filename, fixityValue);
            }
        };

        // Handle line, lines, or data fragments piped in from reader.
        this.stream.on('data', (data) => {
            const lines = data.split(newline);
            // The chunk may begin in the middle of a line whose start
            // arrived in earlier chunks.
            lines[0] = this.lastFragment + lines[0];
            // The last element is either '' (the chunk ended with a
            // newline) or an incomplete line. Keep it for the next chunk;
            // this also preserves the buffer when the chunk contains no
            // newline at all.
            this.lastFragment = lines.pop();
            for (const line of lines) {
                addEntry(line);
            }
        });

        // Handle end of stream: a manifest whose final line has no
        // trailing newline still has that line buffered, so parse it now.
        this.stream.on('end', () => {
            if (this.lastFragment !== '') {
                addEntry(this.lastFragment);
                this.lastFragment = '';
            }
        });
    }
}

module.exports.ManifestParser = ManifestParser;