Record AAC sample offsets

The generated fragmented mp4 for the audio track can now be appended to a source buffer in Chrome 43 without errors but the buffered duration remains zero after the operation completes. All current tests pass. Sample table entries and mdat contents look consistent with an audio-only, fragmented mp4 generated by ffmpeg though some discrepancies exists in track metadata still.

Record AAC sample offsets
The generated fragmented mp4 for the audio track can now be appended to a source buffer in Chrome 43 without errors but the buffered duration remains zero after the operation completes. All current tests pass. Sample table entries and mdat contents look consistent with an audio-only, fragmented mp4 generated by ffmpeg though some discrepancies exists in track metadata still.
David LaPalomento
Showing 4 changed files with 200 additions and 70 deletions
src/mp4-generator.js
src/transmuxer.js
test/mp4-generator_test.js
test/muxer/js/mp4-inspector.js
--- a/src/mp4-generator.js
View file @72a6a13
+++ b/src/mp4-generator.js
View file @72a6a13
@@ -333,17 +333,18 @@ sdtp = function(track) {
  var
    samples = track.samples || [],
    bytes = new Uint8Array(4 + samples.length),
-    sample,
+    flags,
    i;

  // leave the full box header (4 bytes) all zero

  // write the sample table
  for (i = 0; i < samples.length; i++) {
-    sample = samples[i];
-    bytes[i + 4] = (sample.flags.dependsOn << 4) |
-      (sample.flags.isDependedOn << 2) |
-      (sample.flags.hasRedundancy);
+    flags = samples[i].flags;
+
+    bytes[i + 4] = (flags.dependsOn << 4) |
+      (flags.isDependedOn << 2) |
+      (flags.hasRedundancy);
  }

  return box(types.sdtp,
@@ -508,30 +509,58 @@ tkhd = function(track) {
  return box(types.tkhd, result);
 };

+/**
+ * Generate a track fragment (traf) box. A traf box collects metadata
+ * about tracks in a movie fragment (moof) box.
+ */
 traf = function(track) {
-  var sampleDependencyTable = sdtp(track);
+  var trackFragmentHeader, trackFragmentDecodeTime,
+      trackFragmentRun, sampleDependencyTable, dataOffset;
+
+  trackFragmentHeader = box(types.tfhd, new Uint8Array([
+    0x00, // version 0
+    0x00, 0x00, 0x00, // flags
+    (track.id & 0xFF000000) >> 24,
+    (track.id & 0xFF0000) >> 16,
+    (track.id & 0xFF00) >> 8,
+    (track.id & 0xFF) // track_ID
+  ]));
+
+  trackFragmentDecodeTime = box(types.tfdt, new Uint8Array([
+    0x00, // version 0
+    0x00, 0x00, 0x00, // flags
+    0x00, 0x00, 0x00, 0x00 // baseMediaDecodeTime
+  ]));
+
+  // the data offset specifies the number of bytes from the start of
+  // the containing moof to the first payload byte of the associated
+  // mdat
+  dataOffset = (16 + // tfhd
+                16 + // tfdt
+                8 +  // traf header
+                16 + // mfhd
+                8 +  // moof header
+                8);  // mdat header
+
+  // audio tracks require less metadata
+  if (track.type === 'audio') {
+    trackFragmentRun = trun(track, dataOffset);
+    return box(types.traf,
+               trackFragmentHeader,
+               trackFragmentDecodeTime,
+               trackFragmentRun);
+  }
+
+  // video tracks should contain an independent and disposable samples
+  // box (sdtp)
+  // generate one and adjust offsets to match
+  sampleDependencyTable = sdtp(track);
+  trackFragmentRun = trun(track,
+                          sampleDependencyTable.length + dataOffset);
  return box(types.traf,
-             box(types.tfhd, new Uint8Array([
-               0x00, // version 0
-               0x00, 0x00, 0x00, // flags
-               (track.id & 0xFF000000) >> 24,
-               (track.id & 0xFF0000) >> 16,
-               (track.id & 0xFF00) >> 8,
-               (track.id & 0xFF) // track_ID
-             ])),
-             box(types.tfdt, new Uint8Array([
-               0x00, // version 0
-               0x00, 0x00, 0x00, // flags
-               0x00, 0x00, 0x00, 0x00 // baseMediaDecodeTime
-             ])),
-             trun(track,
-                  sampleDependencyTable.length +
-                  16 + // tfhd
-                  16 + // tfdt
-                  8 +  // traf header
-                  16 + // mfhd
-                  8 +  // moof header
-                  8),  // mdat header
+             trackFragmentHeader,
+             trackFragmentDecodeTime,
+             trackFragmentRun,
             sampleDependencyTable);
 };

@@ -571,51 +600,116 @@ trex = function(track) {
  return box(types.trex, result);
 };

-trun = function(track, offset) {
-  var bytes, samples, sample, i;
+(function() {
+  var audioTrun, videoTrun, trunHeader;
+
+  // This method assumes all samples are uniform. That is, if a
+  // duration is present for the first sample, it will be present for
+  // all subsequent samples.
+  // see ISO/IEC 14496-12:2012, Section 8.8.8.1
+  trunHeader = function(samples, offset) {
+    var durationPresent = 0, sizePresent = 0,
+        flagsPresent = 0, compositionTimeOffset = 0;
+
+    // trun flag constants
+    if (samples.length) {
+      if (samples[0].duration !== undefined) {
+        durationPresent = 0x1;
+      }
+      if (samples[0].size !== undefined) {
+        sizePresent = 0x2;
+      }
+      if (samples[0].flags !== undefined) {
+        flagsPresent = 0x4;
+      }
+      if (samples[0].compositionTimeOffset !== undefined) {
+        compositionTimeOffset = 0x8;
+      }
+    }

-  samples = track.samples || [];
-  offset += 8 + 12 + (16 * samples.length);
+    return [
+      0x00, // version 0
+      0x00,
+      durationPresent | sizePresent | flagsPresent | compositionTimeOffset,
+      0x01, // flags
+      (samples.length & 0xFF000000) >>> 24,
+      (samples.length & 0xFF0000) >>> 16,
+      (samples.length & 0xFF00) >>> 8,
+      samples.length & 0xFF, // sample_count
+      (offset & 0xFF000000) >>> 24,
+      (offset & 0xFF0000) >>> 16,
+      (offset & 0xFF00) >>> 8,
+      offset & 0xFF // data_offset
+    ];
+  };

-  bytes = [
-    0x00, // version 0
-    0x00, 0x0f, 0x01, // flags
-    (samples.length & 0xFF000000) >>> 24,
-    (samples.length & 0xFF0000) >>> 16,
-    (samples.length & 0xFF00) >>> 8,
-    samples.length & 0xFF, // sample_count
-    (offset & 0xFF000000) >>> 24,
-    (offset & 0xFF0000) >>> 16,
-    (offset & 0xFF00) >>> 8,
-    offset & 0xFF // data_offset
-  ];
+  videoTrun = function(track, offset) {
+    var bytes, samples, sample, i;
+
+    samples = track.samples || [];
+    offset += 8 + 12 + (16 * samples.length);
+
+    bytes = trunHeader(samples, offset);
+
+    for (i = 0; i < samples.length; i++) {
+      sample = samples[i];
+      bytes = bytes.concat([
+        (sample.duration & 0xFF000000) >>> 24,
+        (sample.duration & 0xFF0000) >>> 16,
+        (sample.duration & 0xFF00) >>> 8,
+        sample.duration & 0xFF, // sample_duration
+        (sample.size & 0xFF000000) >>> 24,
+        (sample.size & 0xFF0000) >>> 16,
+        (sample.size & 0xFF00) >>> 8,
+        sample.size & 0xFF, // sample_size
+        (sample.flags.isLeading << 2) | sample.flags.dependsOn,
+        (sample.flags.isDependedOn << 6) |
+          (sample.flags.hasRedundancy << 4) |
+          (sample.flags.paddingValue << 1) |
+          sample.flags.isNonSyncSample,
+        sample.flags.degradationPriority & 0xF0 << 8,
+        sample.flags.degradationPriority & 0x0F, // sample_flags
+        (sample.compositionTimeOffset & 0xFF000000) >>> 24,
+        (sample.compositionTimeOffset & 0xFF0000) >>> 16,
+        (sample.compositionTimeOffset & 0xFF00) >>> 8,
+        sample.compositionTimeOffset & 0xFF // sample_composition_time_offset
+      ]);
+    }
+    return box(types.trun, new Uint8Array(bytes));
+  };

-  for (i = 0; i < samples.length; i++) {
-    sample = samples[i];
-    bytes = bytes.concat([
-      (sample.duration & 0xFF000000) >>> 24,
-      (sample.duration & 0xFF0000) >>> 16,
-      (sample.duration & 0xFF00) >>> 8,
-      sample.duration & 0xFF, // sample_duration
-      (sample.size & 0xFF000000) >>> 24,
-      (sample.size & 0xFF0000) >>> 16,
-      (sample.size & 0xFF00) >>> 8,
-      sample.size & 0xFF, // sample_size
-      (sample.flags.isLeading << 2) | sample.flags.dependsOn,
-      (sample.flags.isDependedOn << 6) |
-        (sample.flags.hasRedundancy << 4) |
-        (sample.flags.paddingValue << 1) |
-        sample.flags.isNonSyncSample,
-      sample.flags.degradationPriority & 0xF0 << 8,
-      sample.flags.degradationPriority & 0x0F, // sample_flags
-      (sample.compositionTimeOffset & 0xFF000000) >>> 24,
-      (sample.compositionTimeOffset & 0xFF0000) >>> 16,
-      (sample.compositionTimeOffset & 0xFF00) >>> 8,
-      sample.compositionTimeOffset & 0xFF // sample_composition_time_offset
-    ]);
-  }
-  return box(types.trun, new Uint8Array(bytes));
-};
+  audioTrun = function(track, offset) {
+    var bytes, samples, sample, i;
+
+    samples = track.samples || [];
+    offset += 8 + 12 + (8 * samples.length);
+
+    bytes = trunHeader(samples, offset);
+
+    for (i = 0; i < samples.length; i++) {
+      sample = samples[i];
+      bytes = bytes.concat([
+        (sample.duration & 0xFF000000) >>> 24,
+        (sample.duration & 0xFF0000) >>> 16,
+        (sample.duration & 0xFF00) >>> 8,
+        sample.duration & 0xFF, // sample_duration
+        (sample.size & 0xFF000000) >>> 24,
+        (sample.size & 0xFF0000) >>> 16,
+        (sample.size & 0xFF00) >>> 8,
+        sample.size & 0xFF]); // sample_size
+    }
+
+    return box(types.trun, new Uint8Array(bytes));
+  };
+
+  trun = function(track, offset) {
+    if (track.type === 'audio') {
+      return audioTrun(track, offset);
+    } else {
+      return videoTrun(track, offset);
+    }
+  };
+})();

 window.videojs.mp4 = {
  ftyp: ftyp,
--- a/src/transmuxer.js
View file @72a6a13
+++ b/src/transmuxer.js
View file @72a6a13
@@ -486,7 +486,8 @@ AudioSegmentStream = function(track) {
    // concatenate the audio data to constuct the mdat
    data = new Uint8Array(aacFramesLength);
    track.samples = [];
-    while (aacFramesLength.length) {
+    i = 0;
+    while (aacFrames.length) {
      currentFrame = aacFrames[0];
      sample = {
        size: currentFrame.data.byteLength,
--- a/test/mp4-generator_test.js
View file @72a6a13
+++ b/test/mp4-generator_test.js
View file @72a6a13
@@ -518,6 +518,35 @@ test('generates a minimal moof', function() {
  }, 'wrote the sample data table');
 });

+test('generates a moof for audio', function() {
+  var
+    data = mp4.moof(7, [{
+      id: 17,
+      type: 'audio',
+      samples: [{
+        duration: 9000,
+        size: 10
+      }, {
+        duration: 10000,
+        size: 11
+      }]
+    }]),
+    moof = videojs.inspectMp4(data),
+    trun;
+
+  deepEqual(moof[0].boxes[1].boxes.length, 3, 'generated three traf children');
+  trun = moof[0].boxes[1].boxes[2];
+  ok(trun, 'generated a trun');
+  deepEqual(trun.dataOffset, data.byteLength + 8, 'calculated the data offset');
+  deepEqual(trun.samples, [{
+    duration: 9000,
+    size: 10
+  }, {
+    duration: 10000,
+    size: 11
+  }], 'wrote simple audio samples');
+});
+
 test('can generate a traf without samples', function() {
  var
    data = mp4.moof(8, [{
--- a/test/muxer/js/mp4-inspector.js
View file @72a6a13
+++ b/test/muxer/js/mp4-inspector.js
View file @72a6a13
@@ -39,6 +39,12 @@ var
    for (i = 0; i < avcStream.length; i += length) {
      length = avcView.getUint32(i);
      i += 4;
+
+      // bail if this doesn't appear to be an H264 stream
+      if (length <= 0) {
+        return;
+      }
+
      switch(avcStream[i] & 0x1F) {
      case 0x01:
        result.push('slice_layer_without_partitioning_rbsp');