Generate a valid audio initialization segment

Modify the mp4 generator to inspect audio tracks and generate a working initialization segment. Hook the audio init segment up to the mp4 transmuxing test page.

Generate a valid audio initialization segment
Modify the mp4 generator to inspect audio tracks and generate a working initialization segment. Hook the audio init segment up to the mp4 transmuxing test page.
David LaPalomento
Showing 9 changed files with 831 additions and 256 deletions
src/mp4-generator.js
src/transmuxer.js
test/mp4-generator_test.js
test/mp4-inspector_test.js
test/muxer/js/mp4-inspector.js
test/muxer/mp4.html
test/muxer/mse-demo.html
test/muxer/player.html
test/transmuxer_test.js
--- a/src/mp4-generator.js
View file @def510a
+++ b/src/mp4-generator.js
View file @def510a
@@ -4,7 +4,7 @@
 var box, dinf, ftyp, mdat, mfhd, minf, moof, moov, mvex, mvhd, trak,
    tkhd, mdia, mdhd, hdlr, sdtp, stbl, stsd, styp, traf, trex, trun,
    types, MAJOR_BRAND, MINOR_VERSION, AVC1_BRAND, VIDEO_HDLR,
-    AUDIO_HDLR, HDLR_TYPES, VMHD, DREF, STCO, STSC, STSZ, STTS,
+    AUDIO_HDLR, HDLR_TYPES, ESDS, VMHD, SMHD, DREF, STCO, STSC, STSZ, STTS,
    Uint8Array, DataView;
 Uint8Array = window.Uint8Array;
@@ -19,6 +19,7 @@ DataView = window.DataView;
    btrt: [],
    dinf: [],
    dref: [],
+    esds: [],
    ftyp: [],
    hdlr: [],
    mdat: [],
@@ -28,9 +29,11 @@ DataView = window.DataView;
    minf: [],
    moof: [],
    moov: [],
+    mp4a: [], // codingname
    mvex: [],
    mvhd: [],
    sdtp: [],
+    smhd: [],
    stbl: [],
    stco: [],
    stsc: [],
@@ -109,6 +112,39 @@ DataView = window.DataView;
    0x00, // version 0
    0x00, 0x00, 0x01 // entry_flags
  ]);
+  ESDS = new Uint8Array([
+    0x00, // version
+    0x00, 0x00, 0x00, // flags
+    // ES_Descriptor, ISO/IEC 14496-1
+    0x03, // tag, ES_DescrTag
+    0x19, // length, 25: covers every remaining byte in this array
+    0x00, 0x00, // ES_ID
+    0x00, // streamDependenceFlag, URL_flag, reserved, streamPriority
+    // DecoderConfigDescriptor
+    0x04, // tag, DecoderConfigDescrTag
+    0x11, // length, 17: 13 bytes of config plus the 4-byte DecoderSpecificInfo
+    0x40, // objectTypeIndication, 0x40 (MPEG-4 audio per ISO/IEC 14496-1)
+    0x15,  // streamType 5 (audio) in the top 6 bits, then upStream = 0 and reserved = 1
+    0x00, 0x06, 0x00, // bufferSizeDB, 1536
+    0x00, 0x00, 0xda, 0xc0, // maxBitrate, 56000
+    0x00, 0x00, 0xda, 0xc0, // avgBitrate, 56000
+    // DecoderSpecificInfo
+    0x05, // tag, DecoderSpecificInfoTag
+    0x02, // length, 2: the AudioSpecificConfig bytes 0x11 0x90 below
+    // ISO/IEC 14496-3, AudioSpecificConfig (hard-coded, not derived from the track)
+    0x11, // top 5 bits: AudioObjectType 2, AAC LC. low 3 bits: start of samplingFrequencyIndex
+    0x90, // last bit of samplingFrequencyIndex, 3 -> 48000. channelConfig, 2 -> stereo. GASpecificConfig flag bits, all 0
+    0x06, 0x01, 0x02 // SLConfigDescriptor: tag 0x06 (SLConfigDescrTag), length 1, predefined = 2
+  ]);
+  SMHD = new Uint8Array([
+    0x00,             // version
+    0x00, 0x00, 0x00, // flags
+    0x00, 0x00,       // balance, 0 means centered
+    0x00, 0x00        // reserved
+  ]);
  STCO = new Uint8Array([
    0x00, // version
    0x00, 0x00, 0x00, // flags
@@ -171,24 +207,35 @@ hdlr = function(type) {
 mdat = function(data) {
  return box(types.mdat, data);
 };
-mdhd = function(duration) {
+mdhd = function(track) {
-  return box(types.mdhd, new Uint8Array([
+  var result = new Uint8Array([
-    0x00, // version 0
+    0x00,                   // version 0
-    0x00, 0x00, 0x00, // flags
+    0x00, 0x00, 0x00,       // flags
    0x00, 0x00, 0x00, 0x02, // creation_time
    0x00, 0x00, 0x00, 0x03, // modification_time
    0x00, 0x01, 0x5f, 0x90, // timescale, 90,000 "ticks" per second
-    (duration & 0xFF000000) >> 24,
+    (track.duration >>> 24),
-    (duration & 0xFF0000) >> 16,
+    (track.duration >>> 16) & 0xFF,
-    (duration & 0xFF00) >> 8,
+    (track.duration >>>  8) & 0xFF,
-    duration & 0xFF, // duration
+    track.duration & 0xFF,  // duration
-    0x55, 0xc4, // 'und' language (undetermined)
+    0x55, 0xc4,             // 'und' language (undetermined)
    0x00, 0x00
-  ]));
+  ]);
+  // Use the sample rate from the track metadata, when it is
+  // defined. The sample rate can be parsed out of an ADTS header, for
+  // instance.
+  if (track.samplerate) {
+    result[12] = (track.samplerate >>> 24);
+    result[13] = (track.samplerate >>> 16) & 0xFF;
+    result[14] = (track.samplerate >>>  8) & 0xFF;
+    result[15] = (track.samplerate)        & 0xFF;
+  }
+  return box(types.mdhd, result);
 };
 mdia = function(track) {
-  return box(types.mdia, mdhd(track.duration), hdlr(track.type), minf(track));
+  return box(types.mdia, mdhd(track), hdlr(track.type), minf(track));
 };
 mfhd = function(sequenceNumber) {
  return box(types.mfhd, new Uint8Array([
@@ -201,7 +248,10 @@ mfhd = function(sequenceNumber) {
  ]));
 };
 minf = function(track) {
-  return box(types.minf, box(types.vmhd, VMHD), dinf(), stbl(track));
+  return box(types.minf,
+             track.type === 'video' ? box(types.vmhd, VMHD) : box(types.smhd, SMHD),
+             dinf(),
+             stbl(track));
 };
 moof = function(sequenceNumber, tracks) {
  var
@@ -217,7 +267,9 @@ moof = function(sequenceNumber, tracks) {
  ].concat(trackFragments));
 };
 /**
- * @param tracks... (optional) {array} the tracks associated with this movie
+ * Returns a movie box.
+ * @param tracks {array} the tracks associated with this movie
+ * @see ISO/IEC 14496-12:2012(E), section 8.2.1
 */
 moov = function(tracks) {
  var
@@ -307,32 +359,36 @@ stbl = function(track) {
             box(types.stco, STCO));
 };
-stsd = function(track) {
+(function() {
-  var sequenceParameterSets = [], pictureParameterSets = [], i;
+  var videoSample, audioSample;
-  if (track.type === 'audio') {
+  stsd = function(track) {
-    return box(types.stsd);
-  }
-  // assemble the SPSs
+    return box(types.stsd, new Uint8Array([
-  for (i = 0; i < track.sps.length; i++) {
+      0x00, // version 0
-    sequenceParameterSets.push((track.sps[i].byteLength & 0xFF00) >>> 8);
+      0x00, 0x00, 0x00, // flags
-    sequenceParameterSets.push((track.sps[i].byteLength & 0xFF)); // sequenceParameterSetLength
+      0x00, 0x00, 0x00, 0x01
-    sequenceParameterSets = sequenceParameterSets.concat(Array.prototype.slice.call(track.sps[i])); // SPS
+    ]), track.type === 'video' ? videoSample(track) : audioSample(track));
-  }
+  };
-  // assemble the PPSs
+  videoSample = function(track) {
-  for (i = 0; i < track.pps.length; i++) {
+    var sequenceParameterSets = [], pictureParameterSets = [], i;
-    pictureParameterSets.push((track.pps[i].byteLength & 0xFF00) >>> 8);
-    pictureParameterSets.push((track.pps[i].byteLength & 0xFF));
-    pictureParameterSets = pictureParameterSets.concat(Array.prototype.slice.call(track.pps[i]));
-  }
-  return box(types.stsd, new Uint8Array([
+    // assemble the SPSs
-    0x00, // version 0
+    for (i = 0; i < track.sps.length; i++) {
-    0x00, 0x00, 0x00, // flags
+      sequenceParameterSets.push((track.sps[i].byteLength & 0xFF00) >>> 8);
-    0x00, 0x00, 0x00, 0x01]),
+      sequenceParameterSets.push((track.sps[i].byteLength & 0xFF)); // sequenceParameterSetLength
-    box(types.avc1, new Uint8Array([
+      sequenceParameterSets = sequenceParameterSets.concat(Array.prototype.slice.call(track.sps[i])); // SPS
+    }
+    // assemble the PPSs
+    for (i = 0; i < track.pps.length; i++) {
+      pictureParameterSets.push((track.pps[i].byteLength & 0xFF00) >>> 8);
+      pictureParameterSets.push((track.pps[i].byteLength & 0xFF));
+      pictureParameterSets = pictureParameterSets.concat(Array.prototype.slice.call(track.pps[i]));
+    }
+    return box(types.avc1, new Uint8Array([
      0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, // reserved
      0x00, 0x01, // data_reference_index
@@ -359,31 +415,60 @@ stsd = function(track) {
      0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, // compressorname
      0x00, 0x18, // depth = 24
-      0x11, 0x11]), // pre_defined = -1
+      0x11, 0x11 // pre_defined = -1
-        box(types.avcC, new Uint8Array([
+    ]), box(types.avcC, new Uint8Array([
-          0x01, // configurationVersion
+      0x01, // configurationVersion
-          track.profileIdc, // AVCProfileIndication
+      track.profileIdc, // AVCProfileIndication
-          track.profileCompatibility, // profile_compatibility
+      track.profileCompatibility, // profile_compatibility
-          track.levelIdc, // AVCLevelIndication
+      track.levelIdc, // AVCLevelIndication
-          0xff // lengthSizeMinusOne, hard-coded to 4 bytes
+      0xff // lengthSizeMinusOne, hard-coded to 4 bytes
-        ].concat([
+    ].concat([
-          track.sps.length // numOfSequenceParameterSets
+      track.sps.length // numOfSequenceParameterSets
-        ]).concat(sequenceParameterSets).concat([
+    ]).concat(sequenceParameterSets).concat([
-          track.pps.length // numOfPictureParameterSets
+      track.pps.length // numOfPictureParameterSets
-        ]).concat(pictureParameterSets))), // "PPS"
+    ]).concat(pictureParameterSets))), // "PPS"
-        box(types.btrt, new Uint8Array([
+            box(types.btrt, new Uint8Array([
-          0x00, 0x1c, 0x9c, 0x80, // bufferSizeDB
+              0x00, 0x1c, 0x9c, 0x80, // bufferSizeDB
-          0x00, 0x2d, 0xc6, 0xc0, // maxBitrate
+              0x00, 0x2d, 0xc6, 0xc0, // maxBitrate
-          0x00, 0x2d, 0xc6, 0xc0])) // avgBitrate
+              0x00, 0x2d, 0xc6, 0xc0
-        ));
+            ])) // avgBitrate
-};
+              );
+  };
+  audioSample = function(track) { // builds the mp4a sample entry for an audio track, with the constant ESDS box nested inside
+    return box(types.mp4a, new Uint8Array([
+      // SampleEntry, ISO/IEC 14496-12
+      0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, // reserved
+      0x00, 0x01, // data_reference_index
+      // AudioSampleEntry, ISO/IEC 14496-12
+      0x00, 0x00, 0x00, 0x00, // reserved
+      0x00, 0x00, 0x00, 0x00, // reserved
+      (track.channelcount & 0xff00) >> 8,
+      (track.channelcount & 0xff), // channelcount, 16-bit big-endian
+      (track.samplesize & 0xff00) >> 8,
+      (track.samplesize & 0xff), // samplesize, 16-bit big-endian
+      0x00, 0x00, // pre_defined
+      0x00, 0x00, // reserved
+      (track.samplerate & 0xff00) >> 8,
+      (track.samplerate & 0xff),
+      0x00, 0x00 // samplerate, 16.16 fixed point with a zero fraction. NOTE(review): only the low 16 bits of track.samplerate are written, so rates above 65535 (e.g. 88200, 96000) are truncated
+      // MP4AudioSampleEntry, ISO/IEC 14496-14: the esds box appended below
+    ]), box(types.esds, ESDS));
+  };
+})();
 styp = function() {
  return box(types.styp, MAJOR_BRAND, MINOR_VERSION, MAJOR_BRAND);
 };
 tkhd = function(track) {
-  return box(types.tkhd, new Uint8Array([
+  var result = new Uint8Array([
    0x00, // version 0
    0x00, 0x00, 0x07, // flags
    0x00, 0x00, 0x00, 0x00, // creation_time
@@ -401,7 +486,7 @@ tkhd = function(track) {
    0x00, 0x00, 0x00, 0x00, // reserved
    0x00, 0x00, // layer
    0x00, 0x00, // alternate_group
-    0x00, 0x00, // non-audio track volume
+    0x01, 0x00, // non-audio track volume
    0x00, 0x00, // reserved
    0x00, 0x01, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
@@ -418,7 +503,9 @@ tkhd = function(track) {
    (track.height & 0xFF00) >> 8,
    track.height & 0xFF,
    0x00, 0x00 // height
-  ]));
+  ]);
+  return box(types.tkhd, result);
 };
 traf = function(track) {
@@ -461,7 +548,7 @@ trak = function(track) {
 };
 trex = function(track) {
-  return box(types.trex, new Uint8Array([
+  var result = new Uint8Array([
    0x00, // version 0
    0x00, 0x00, 0x00, // flags
    (track.id & 0xFF000000) >> 24,
@@ -472,7 +559,16 @@ trex = function(track) {
    0x00, 0x00, 0x00, 0x00, // default_sample_duration
    0x00, 0x00, 0x00, 0x00, // default_sample_size
    0x00, 0x01, 0x00, 0x01 // default_sample_flags
-  ]));
+  ]);
+  // the last two bytes of default_sample_flags is the sample
+  // degradation priority, a hint about the importance of this sample
+  // relative to others. Lower the degradation priority for all sample
+  // types other than video.
+  if (track.type !== 'video') {
+    result[result.length - 1] = 0x00;
+  }
+  return box(types.trex, result);
 };
 trun = function(track, offset) {
--- a/src/transmuxer.js
View file @def510a
+++ b/src/transmuxer.js
View file @def510a
@@ -16,12 +16,29 @@
 var
  TransportPacketStream, TransportParseStream, ElementaryStream, VideoSegmentStream,
-  Transmuxer, AacStream, H264Stream, NalByteStream,
+  AudioSegmentStream, Transmuxer, AacStream, H264Stream, NalByteStream,
-  MP2T_PACKET_LENGTH, H264_STREAM_TYPE, ADTS_STREAM_TYPE, mp4;
+  MP2T_PACKET_LENGTH, H264_STREAM_TYPE, ADTS_STREAM_TYPE,
+  ADTS_SAMPLING_FREQUENCIES, mp4;
 MP2T_PACKET_LENGTH = 188; // bytes
 H264_STREAM_TYPE = 0x1b;
 ADTS_STREAM_TYPE = 0x0f;
+ADTS_SAMPLING_FREQUENCIES = [ // sample rates (presumably Hz), looked up by the 4-bit index AacStream reads from each ADTS frame header
+  96000, // index 0
+  88200, // index 1
+  64000, // index 2
+  48000, // index 3
+  44100, // index 4
+  32000, // index 5
+  24000, // index 6
+  22050, // index 7
+  16000, // index 8
+  12000, // index 9
+  11025, // index 10
+  8000, // index 11
+  7350 // index 12; indices 13-15 have no entry and yield undefined
+];
 mp4 = videojs.mp4;
 /**
@@ -438,6 +455,11 @@ AacStream = function() {
        // deliver the AAC frame
        this.trigger('data', {
+          channelcount: ((buffer[i + 1] & 1) << 3) | 
+            ((buffer[i + 2] & 0xc0) >> 6),
+          samplerate: ADTS_SAMPLING_FREQUENCIES[(buffer[i + 1] & 0x3c) >> 2],
+          // assume ISO/IEC 14496-12 AudioSampleEntry default of 16
+          samplesize: 16,
          data: buffer.subarray(i + 6, i + frameLength - 1)
        });
@@ -457,6 +479,62 @@ AacStream = function() {
 AacStream.prototype = new videojs.Hls.Stream();
 /**
+ * Constructs a single-track, ISO BMFF media segment from AAC data
+ * events. The output of this stream can be fed to a SourceBuffer
+ * configured with a suitable initialization segment.
+ */
+// TODO: share common code with VideoSegmentStream
+AudioSegmentStream = function(track) {
+  var aacFrames = [], aacFramesLength = 0, sequenceNumber = 0;
+  AudioSegmentStream.prototype.init.call(this);
+  this.push = function(data) {
+    // buffer audio data until end() is called
+    aacFrames.push(data);
+    aacFramesLength += data.data.byteLength;
+  };
+  this.end = function() {
+    var boxes, currentFrame, data, sample, i, mdat, moof;
+    // return early if no audio data has been observed
+    if (aacFramesLength === 0) {
+      return;
+    }
+    // concatenate the audio data to constuct the mdat
+    data = new Uint8Array(aacFramesLength);
+    track.samples = [];
+    while (aacFramesLength.length) {
+      currentFrame = aacFrames[0];
+      sample = {
+        size: currentFrame.data.byteLength,
+        duration: 1024 // FIXME calculate for realz
+      };
+      track.samples.push(sample);
+      data.set(currentFrame.data, i);
+      i += currentFrame.data.byteLength;
+      aacFrames.shift();
+    }
+    aacFramesLength = 0;
+    mdat = mp4.mdat(data);
+    moof = mp4.moof(sequenceNumber, [track]);
+    boxes = new Uint8Array(moof.byteLength + mdat.byteLength);
+    // bump the sequence number for next time
+    sequenceNumber++;
+    boxes.set(moof);
+    boxes.set(mdat, moof.byteLength);
+    this.trigger('data', boxes);
+  };
+};
+AudioSegmentStream.prototype = new videojs.Hls.Stream();
+/**
 * Accepts a NAL unit byte stream and unpacks the embedded NAL units.
 */
 NalByteStream = function() {
@@ -539,7 +617,7 @@ NalByteStream = function() {
  this.end = function() {
    // deliver the last buffered NAL unit
-    if (buffer.byteLength > 3) {
+    if (buffer && buffer.byteLength > 3) {
      this.trigger('data', buffer.subarray(syncPoint + 3));
    }
  };
@@ -763,12 +841,19 @@ VideoSegmentStream = function(track) {
  this.end = function() {
    var startUnit, currentNal, moof, mdat, boxes, i, data, view, sample;
+    // return early if no video data has been observed
+    if (nalUnitsLength === 0) {
+      return;
+    }
    // concatenate the video data and construct the mdat
    // first, we have to build the index from byte locations to
    // samples (that is, frames) in the video data
    data = new Uint8Array(nalUnitsLength + (4 * nalUnits.length));
    view = new DataView(data.buffer);
    track.samples = [];
+    // see ISO/IEC 14496-12:2012, section 8.6.4.3
    sample = {
      size: 0,
      flags: {
@@ -853,11 +938,14 @@ VideoSegmentStream.prototype = new videojs.Hls.Stream();
 Transmuxer = function() {
  var
    self = this,
-    track,
+    videoTrack,
+    audioTrack,
    config,
    pps,
-    packetStream, parseStream, elementaryStream, aacStream, h264Stream, videoSegmentStream;
+    packetStream, parseStream, elementaryStream,
+    aacStream, h264Stream, 
+    videoSegmentStream, audioSegmentStream;
  Transmuxer.prototype.init.call(this);
@@ -880,51 +968,78 @@ Transmuxer = function() {
        !config) {
      config = data.config;
-      track.width = config.width;
+      videoTrack.width = config.width;
-      track.height = config.height;
+      videoTrack.height = config.height;
-      track.sps = [data.data];
+      videoTrack.sps = [data.data];
-      track.profileIdc = config.profileIdc;
+      videoTrack.profileIdc = config.profileIdc;
-      track.levelIdc = config.levelIdc;
+      videoTrack.levelIdc = config.levelIdc;
-      track.profileCompatibility = config.profileCompatibility;
+      videoTrack.profileCompatibility = config.profileCompatibility;
      // generate an init segment once all the metadata is available
      if (pps) {
        self.trigger('data', {
-          data: videojs.mp4.initSegment([track])
+          type: 'video',
+          data: videojs.mp4.initSegment([videoTrack])
        });
      }
    }
    if (data.nalUnitType === 'pic_parameter_set_rbsp' &&
        !pps) {
      pps = data.data;
-      track.pps = [data.data];
+      videoTrack.pps = [data.data];
      if (config) {
        self.trigger('data', {
-          data: videojs.mp4.initSegment([track])
+          type: 'video',
+          data: videojs.mp4.initSegment([videoTrack])
        });
      }
    }
  });
-  // hook up the video segment stream once track metadata is delivered
+  // generate an init segment based on the first audio sample
-  elementaryStream.on('data', function(data) {
+  aacStream.on('data', function(data) {
-    var i, triggerData = function(segment) {
+    if (audioTrack && audioTrack.channelcount === undefined) {
+      audioTrack.channelcount = data.channelcount;
+      audioTrack.samplerate = data.samplerate;
+      audioTrack.samplesize = data.samplesize;
      self.trigger('data', {
-        data: segment
+        type: 'audio',
+        data: videojs.mp4.initSegment([audioTrack])
      });
+    }
+  });
+  // hook up the segment streams once track metadata is delivered
+  elementaryStream.on('data', function(data) {
+    var i, triggerData = function(type) {
+      return function(segment) {
+        self.trigger('data', {
+          type: type,
+          data: segment
+        });
+      };
    };
    if (data.type === 'metadata') {
      i = data.tracks.length;
+      // scan the tracks listed in the metadata
      while (i--) {
-        if (data.tracks[i].type === 'video') {
-          track = data.tracks[i];
+        // hook up the video segment stream to the first track with h264 data
-          if (!videoSegmentStream) {
+        if (data.tracks[i].type === 'video' && !videoSegmentStream) {
-            videoSegmentStream = new VideoSegmentStream(track);
+          videoTrack = data.tracks[i];
-            h264Stream.pipe(videoSegmentStream);
+          videoSegmentStream = new VideoSegmentStream(videoTrack);
-            videoSegmentStream.on('data', triggerData);
+          h264Stream.pipe(videoSegmentStream);
-          }
+          videoSegmentStream.on('data', triggerData('video'));
          break;
        }
+        // hook up the audio segment stream to the first track with aac data
+        if (data.tracks[i].type === 'audio' && !audioSegmentStream) {
+          audioTrack = data.tracks[i];
+          audioSegmentStream = new AudioSegmentStream(audioTrack);
+          aacStream.pipe(audioSegmentStream);
+          audioSegmentStream.on('data', triggerData('audio'));
+        }
      }
    }
  });
@@ -938,6 +1053,7 @@ Transmuxer = function() {
    elementaryStream.end();
    h264Stream.end();
    videoSegmentStream.end();
+    audioSegmentStream.end();
  };
 };
 Transmuxer.prototype = new videojs.Hls.Stream();
--- a/test/mp4-generator_test.js
View file @def510a
+++ b/test/mp4-generator_test.js
View file @def510a
@@ -22,7 +22,11 @@
 */
 var
  mp4 = videojs.mp4,
-  inspectMp4 = videojs.inspectMp4;
+  inspectMp4 = videojs.inspectMp4,
+  validateMvhd, validateTrak, validateTkhd, validateMdia,
+  validateMdhd, validateHdlr, validateMinf, validateDinf,
+  validateStbl, validateStsd, validateMvex,
+  validateVideoSample, validateAudioSample;
 module('MP4 Generator');
@@ -39,72 +43,90 @@ test('generates a BSMFF ftyp', function() {
  equal(boxes[0].minorVersion, 1, 'minor version is one');
 });
-test('generates a moov', function() {
+validateMvhd = function(mvhd) {
-  var boxes, mvhd, tkhd, mdhd, hdlr, minf, mvex,
-    data = mp4.moov([{
-      id: 7,
-      duration: 100,
-      width: 600,
-      height: 300,
-      type: 'video',
-      profileIdc: 3,
-      levelIdc: 5,
-      profileCompatibility: 7,
-      sps: [new Uint8Array([0, 1, 2]), new Uint8Array([3, 4, 5])],
-      pps: [new Uint8Array([6, 7, 8])]
-    }]);
-  ok(data, 'box is not null');
-  boxes = inspectMp4(data);
-  equal(boxes.length, 1, 'generated a single box');
-  equal(boxes[0].type, 'moov', 'generated a moov type');
-  equal(boxes[0].size, data.byteLength, 'generated size');
-  equal(boxes[0].boxes.length, 3, 'generated three sub boxes');
-  mvhd = boxes[0].boxes[0];
  equal(mvhd.type, 'mvhd', 'generated a mvhd');
  equal(mvhd.duration, 0xffffffff, 'wrote the maximum movie header duration');
  equal(mvhd.nextTrackId, 0xffffffff, 'wrote the max next track id');
+};
-  equal(boxes[0].boxes[1].type, 'trak', 'generated a trak');
+validateTrak = function(trak, expected) {
-  equal(boxes[0].boxes[1].boxes.length, 2, 'generated two track sub boxes');
+  expected = expected || {};
-  tkhd = boxes[0].boxes[1].boxes[0];
+  equal(trak.type, 'trak', 'generated a trak');
+  equal(trak.boxes.length, 2, 'generated two track sub boxes');
+  validateTkhd(trak.boxes[0], expected);
+  validateMdia(trak.boxes[1], expected);
+};
+validateTkhd = function(tkhd, expected) {
  equal(tkhd.type, 'tkhd', 'generated a tkhd');
  equal(tkhd.trackId, 7, 'wrote the track id');
  deepEqual(tkhd.flags, new Uint8Array([0, 0, 7]), 'flags should equal 7');
-  equal(tkhd.duration, 100, 'wrote duration into the track header');
+  equal(tkhd.duration,
-  equal(tkhd.width, 600, 'wrote width into the track header');
+        expected.duration || Math.pow(2, 32) - 1,
-  equal(tkhd.height, 300, 'wrote height into the track header');
+        'wrote duration into the track header');
+  equal(tkhd.width, expected.width || 0, 'wrote width into the track header');
-  equal(boxes[0].boxes[1].boxes[1].type, 'mdia', 'generated an mdia type');
+  equal(tkhd.height, expected.height || 0, 'wrote height into the track header');
-  equal(boxes[0].boxes[1].boxes[1].boxes.length, 3, 'generated three track media sub boxes');
+  equal(tkhd.volume, 1, 'set volume to 1');
+};
-  mdhd = boxes[0].boxes[1].boxes[1].boxes[0];
+validateMdia = function(mdia, expected) {
+  equal(mdia.type, 'mdia', 'generated an mdia type');
+  equal(mdia.boxes.length, 3, 'generated three track media sub boxes');
+  validateMdhd(mdia.boxes[0], expected);
+  validateHdlr(mdia.boxes[1], expected);
+  validateMinf(mdia.boxes[2], expected);
+};
+validateMdhd = function(mdhd, expected) {
  equal(mdhd.type, 'mdhd', 'generate an mdhd type');
  equal(mdhd.language, 'und', 'wrote undetermined language');
-  equal(mdhd.duration, 100, 'wrote duration into the media header');
+  equal(mdhd.timescale, expected.timescale || 90000, 'wrote the timescale');
+  equal(mdhd.duration,
+        expected.duration || Math.pow(2, 32) - 1,
+        'wrote duration into the media header');
+};
-  hdlr = boxes[0].boxes[1].boxes[1].boxes[1];
+validateHdlr = function(hdlr, expected) {
  equal(hdlr.type, 'hdlr', 'generate an hdlr type');
-  equal(hdlr.handlerType, 'vide', 'wrote a video handler');
+  if (expected.type !== 'audio') {
-  equal(hdlr.name, 'VideoHandler', 'wrote the handler name');
+    equal(hdlr.handlerType, 'vide', 'wrote a video handler');
+    equal(hdlr.name, 'VideoHandler', 'wrote the handler name');
-  minf = boxes[0].boxes[1].boxes[1].boxes[2];
+  } else {
+    equal(hdlr.handlerType, 'soun', 'wrote a sound handler');
+    equal(hdlr.name, 'SoundHandler', 'wrote the sound handler name');
+  }
+};
+validateMinf = function(minf, expected) {
  equal(minf.type, 'minf', 'generate an minf type');
  equal(minf.boxes.length, 3, 'generates three minf sub boxes');
-  equal(minf.boxes[0].type, 'vmhd', 'generates a vmhd type');
+  if (expected.type !== 'audio') {
-  deepEqual({
+    deepEqual({
-    type: 'vmhd',
+      type: 'vmhd',
-    size: 20,
+      size: 20,
-    version: 0,
+      version: 0,
-    flags: new Uint8Array([0, 0, 1]),
+      flags: new Uint8Array([0, 0, 1]),
-    graphicsmode: 0,
+      graphicsmode: 0,
-    opcolor: new Uint16Array([0, 0, 0])
+      opcolor: new Uint16Array([0, 0, 0])
-  }, minf.boxes[0], 'generates a vhmd');
+    }, minf.boxes[0], 'generates a vhmd');
+  } else {
+    deepEqual({
+      type: 'smhd',
+      size: 16,
+      version: 0,
+      flags: new Uint8Array([0, 0, 0]),
+      balance: 0
+    }, minf.boxes[0], 'generates an smhd');
+  }
-  equal(minf.boxes[1].type, 'dinf', 'generates a dinf type');
+  validateDinf(minf.boxes[1]);
+  validateStbl(minf.boxes[2], expected);
+};
+validateDinf = function(dinf) {
  deepEqual({
    type: 'dinf',
    size: 36,
@@ -120,82 +142,123 @@ test('generates a moov', function() {
        flags: new Uint8Array([0, 0, 1])
      }]
    }]
-  }, minf.boxes[1], 'generates a dinf');
+  }, dinf, 'generates a dinf');
+};
+validateStbl = function(stbl, expected) {
+  equal(stbl.type, 'stbl', 'generates an stbl type');
+  equal(stbl.boxes.length, 5, 'generated five stbl child boxes');
-  equal(minf.boxes[2].type, 'stbl', 'generates an stbl type');
+  validateStsd(stbl.boxes[0], expected);
  deepEqual({
-    type: 'stbl',
+    type: 'stts',
-    size: 228,
+    size: 16,
-    boxes: [{
+    version: 0,
-      type: 'stsd',
+    flags: new Uint8Array([0, 0, 0]),
-      size: 152,
+    timeToSamples: []
-      version: 0,
+  }, stbl.boxes[1], 'generated an stts');
-      flags: new Uint8Array([0, 0, 0]),
+  deepEqual({
-      sampleDescriptions: [{
+    type: 'stsc',
-        type: 'avc1',
+    size: 16,
-        size: 136,
+    version: 0,
-        dataReferenceIndex: 1,
+    flags: new Uint8Array([0, 0, 0]),
-        width: 600,
+    sampleToChunks: []
-        height: 300,
+  }, stbl.boxes[2], 'generated an stsc');
-        horizresolution: 72,
+  deepEqual({
-        vertresolution: 72,
+    type: 'stsz',
-        frameCount: 1,
+    version: 0,
-        depth: 24,
+    size: 20,
-        config: [{
+    flags: new Uint8Array([0, 0, 0]),
-          type: 'avcC',
+    sampleSize: 0,
-          size: 30,
+    entries: []
-          configurationVersion: 1,
+  }, stbl.boxes[3], 'generated an stsz');
-          avcProfileIndication: 3,
+  deepEqual({
-          avcLevelIndication: 5,
+    type: 'stco',
-          profileCompatibility: 7,
+    size: 16,
-          lengthSizeMinusOne: 3,
+    version: 0,
-          sps: [new Uint8Array([
+    flags: new Uint8Array([0, 0, 0]),
-            0, 1, 2
+    chunkOffsets: []
-          ]), new Uint8Array([
+  }, stbl.boxes[4], 'generated and stco');
-            3, 4, 5
+};
-          ])],
-          pps: [new Uint8Array([
+validateStsd = function(stsd, expected) {
-            6, 7, 8
+  equal(stsd.type, 'stsd', 'generated an stsd');
-          ])]
+  equal(stsd.sampleDescriptions.length, 1, 'generated one sample');
-        }, {
+  if (expected.type !== 'audio') {
-          type: 'btrt',
+    validateVideoSample(stsd.sampleDescriptions[0]);
-          size: 20,
+  } else {
-          bufferSizeDB: 1875072,
+    validateAudioSample(stsd.sampleDescriptions[0]);
-          maxBitrate: 3000000,
+  }
-          avgBitrate: 3000000
+};
-        }]
-      }]
+validateVideoSample = function(sample) {
-    }, {
+  deepEqual(sample, {
-      type: 'stts',
+    type: 'avc1',
-      size: 16,
+    size: 136,
-      version: 0,
+    dataReferenceIndex: 1,
-      flags: new Uint8Array([0, 0, 0]),
+    width: 600,
-      timeToSamples: []
+    height: 300,
-    }, {
+    horizresolution: 72,
-      type: 'stsc',
+    vertresolution: 72,
-      size: 16,
+    frameCount: 1,
-      version: 0,
+    depth: 24,
-      flags: new Uint8Array([0, 0, 0]),
+    config: [{
-      sampleToChunks: []
+      type: 'avcC',
+      size: 30,
+      configurationVersion: 1,
+      avcProfileIndication: 3,
+      avcLevelIndication: 5,
+      profileCompatibility: 7,
+      lengthSizeMinusOne: 3,
+      sps: [new Uint8Array([
+        0, 1, 2
+      ]), new Uint8Array([
+        3, 4, 5
+      ])],
+      pps: [new Uint8Array([
+        6, 7, 8
+      ])]
    }, {
-      type: 'stsz',
+      type: 'btrt',
-      version: 0,
      size: 20,
-      flags: new Uint8Array([0, 0, 0]),
+      bufferSizeDB: 1875072,
-      sampleSize: 0,
+      maxBitrate: 3000000,
-      entries: []
+      avgBitrate: 3000000
-    }, {
+    }]
-      type: 'stco',
+  }, 'generated a video sample');
-      size: 16,
+};
+validateAudioSample = function(sample) {
+  deepEqual(sample, {
+    type: 'mp4a',
+    size: 75,
+    dataReferenceIndex: 1,
+    channelcount: 2,
+    samplesize: 16,
+    samplerate: 48000,
+    streamDescriptor: {
+      type: 'esds',
      version: 0,
      flags: new Uint8Array([0, 0, 0]),
-      chunkOffsets: []
+      size: 39,
-    }]
+      esId: 0,
-  }, minf.boxes[2], 'generates a stbl');
+      streamPriority: 0,
+      // these values were hard-coded based on a working audio init segment
+      decoderConfig: {
-  mvex = boxes[0].boxes[2];
+        avgBitrate: 56000,
-  equal(mvex.type, 'mvex', 'generates an mvex type');
+        maxBitrate: 56000,
+        bufferSize: 1536,
+        objectProfileIndication: 64,
+        streamType: 5
+      }
+    }
+  }, 'generated an audio sample');
+};
+validateMvex = function(mvex, options) {
+  options = options || {
+    sampleDegradationPriority: 1
+  };
  deepEqual({
    type: 'mvex',
    size: 40,
@@ -213,17 +276,75 @@ test('generates a moov', function() {
      sampleHasRedundancy: 0,
      samplePaddingValue: 0,
      sampleIsDifferenceSample: true,
-      sampleDegradationPriority: 1
+      sampleDegradationPriority: options.sampleDegradationPriority
    }]
  }, mvex, 'writes a movie extends box');
+};
+test('generates a video moov', function() {
+  var
+    boxes,
+    data = mp4.moov([{
+      id: 7,
+      duration: 100,
+      width: 600,
+      height: 300,
+      type: 'video',
+      profileIdc: 3,
+      levelIdc: 5,
+      profileCompatibility: 7,
+      sps: [new Uint8Array([0, 1, 2]), new Uint8Array([3, 4, 5])],
+      pps: [new Uint8Array([6, 7, 8])]
+    }]);
+  ok(data, 'box is not null');
+  boxes = inspectMp4(data);
+  equal(boxes.length, 1, 'generated a single box');
+  equal(boxes[0].type, 'moov', 'generated a moov type');
+  equal(boxes[0].size, data.byteLength, 'generated size');
+  equal(boxes[0].boxes.length, 3, 'generated three sub boxes');
+  validateMvhd(boxes[0].boxes[0]);
+  validateTrak(boxes[0].boxes[1], {
+    duration: 100,
+    width: 600,
+    height: 300
+  });
+  validateMvex(boxes[0].boxes[2]);
+});
+test('generates an audio moov', function() {
+  var
+    data = mp4.moov([{
+      id: 7,
+      type: 'audio',
+      channelcount: 2,
+      samplerate: 48000,
+      samplesize: 16
+    }]),
+    boxes;
+  ok(data, 'box is not null');
+  boxes = inspectMp4(data);
+  equal(boxes.length, 1, 'generated a single box');
+  equal(boxes[0].type, 'moov', 'generated a moov type');
+  equal(boxes[0].size, data.byteLength, 'generated size');
+  equal(boxes[0].boxes.length, 3, 'generated three sub boxes');
+  validateMvhd(boxes[0].boxes[0]);
+  validateTrak(boxes[0].boxes[1], {
+    type: 'audio',
+    timescale: 48000
+  });
+  validateMvex(boxes[0].boxes[2], {
+    sampleDegradationPriority: 0
+  });
 });
 test('generates a sound hdlr', function() {
  var boxes, hdlr,
    data = mp4.moov([{
      duration:100,
-      width: 600,
-      height: 300,
      type: 'audio'
    }]);
--- a/test/mp4-inspector_test.js
View file @def510a
+++ b/test/mp4-inspector_test.js
View file @def510a
@@ -586,6 +586,75 @@ test('can parse a video stsd', function() {
  }]);
 });
+test('can parse an audio stsd', function() {
+  var data = box('stsd',
+                 0x00,                         // version 0
+                 0x00, 0x00, 0x00,             // flags
+                 0x00, 0x00, 0x00, 0x01,       // entry_count
+                 box('mp4a',
+                     0x00, 0x00, 0x00,
+                     0x00, 0x00, 0x00,         // reserved
+                     0x00, 0x01,               // data_reference_index
+                     0x00, 0x00, 0x00, 0x00,
+                     0x00, 0x00, 0x00, 0x00,   // reserved
+                     0x00, 0x02,               // channelcount
+                     0x00, 0x10,               // samplesize
+                     0x00, 0x00,               // pre_defined
+                     0x00, 0x00,               // reserved
+                     0xbb, 0x80, 0x00, 0x00, // samplerate, fixed-point 16.16
+                     box('esds',
+                         0x00, // version 0
+                         0x00, 0x00, 0x00, // flags
+                         0x03, // tag, ES_DescrTag
+                         0x00, // length
+                         0x00, 0x01, // ES_ID
+                         0x00, // streamDependenceFlag, URL_Flag, reserved, streamPriority
+                         // DecoderConfigDescriptor
+                         0x04, // tag, DecoderConfigDescrTag
+                         0x0d, // length
+                         0x40, // objectProfileIndication, Audio ISO/IEC 14496-3
+                         0x15, // streamType, AudioStream. upstream, reserved
+                         0x00, 0x00, 0xff, // bufferSizeDB
+                         0x00, 0x00, 0x00, 0xff, // maxBitrate
+                         0x00, 0x00, 0x00, 0xaa, // avgBitrate
+                         // DecoderSpecificInfo
+                         0x05, // tag, DecoderSpecificInfoTag
+                         0x02, // length
+                         0x11, 0x90, 0x06, 0x01, 0x02))); // decoder specific info
+  deepEqual(videojs.inspectMp4(new Uint8Array(data)), [{
+    version: 0,
+    flags: new Uint8Array([0, 0, 0]),
+    type: 'stsd',
+    size: 91,
+    sampleDescriptions: [{
+      type: 'mp4a',
+      dataReferenceIndex: 1,
+      channelcount: 2,
+      samplesize: 16,
+      samplerate: 48000,
+      size: 75,
+      streamDescriptor: {
+        type: 'esds',
+        version: 0,
+        size: 39,
+        flags: new Uint8Array([0, 0, 0]),
+        esId: 1,
+        streamPriority: 0,
+        decoderConfig: {
+          objectProfileIndication: 0x40,
+          streamType: 0x05,
+          bufferSize: 0xff,
+          maxBitrate: 0xff,
+          avgBitrate: 0xaa
+        }
+      }
+    }]
+  }], 'parsed an audio stsd');
+});
 test('can parse an styp', function() {
  deepEqual(videojs.inspectMp4(new Uint8Array(box('styp',
    0x61, 0x76, 0x63, 0x31, // major brand
@@ -845,6 +914,24 @@ test('can parse a sidx', function(){
            }]);
 });
+test('can parse an smhd', function() {
+  var data = box('smhd',
+                 0x00,             // version
+                 0x00, 0x00, 0x00, // flags
+                 0x00, 0xff,       // balance, fixed-point 8.8
+                 0x00, 0x00);      // reserved
+  deepEqual(videojs.inspectMp4(new Uint8Array(data)),
+            [{
+              type: 'smhd',
+              size: 16,
+              version: 0,
+              flags: new Uint8Array([0, 0, 0]),
+              balance: 0xff / Math.pow(2, 8)
+            }],
+            'parsed an smhd');
+});
 test('can parse a tfdt', function() {
  var data = box('tfdt',
                 0x00, // version
--- a/test/muxer/js/mp4-inspector.js
View file @def510a
+++ b/test/muxer/js/mp4-inspector.js
View file @def510a
@@ -129,6 +129,27 @@ var
        avgBitrate: view.getUint32(8)
      };
    },
+    esds: function(data) {
+      return {
+        version: data[0],
+        flags: new Uint8Array(data.subarray(1, 4)),
+        esId: (data[6] << 8) | data[7],
+        streamPriority: data[8] & 0x1f,
+        decoderConfig: {
+          objectProfileIndication: data[11],
+          streamType: (data[12] >>> 2) & 0x3f,
+          bufferSize: (data[13] << 16) | (data[14] << 8) | data[15],
+          maxBitrate: (data[16] << 24) |
+            (data[17] << 16) |
+            (data[18] <<  8) |
+            data[19],
+          avgBitrate: (data[20] << 24) |
+            (data[21] << 16) |
+            (data[22] <<  8) |
+            data[23]
+        }
+      };
+    },
    ftyp: function(data) {
      var
        view = new DataView(data.buffer, data.byteOffset, data.byteLength),
@@ -247,6 +268,30 @@ var
        boxes: videojs.inspectMp4(data)
      };
    },
+    // codingname, not a first-class box type. stsd entries share the
+    // same format as real boxes so the parsing infrastructure can be
+    // shared
+    mp4a: function(data) {
+      var
+        view = new DataView(data.buffer, data.byteOffset, data.byteLength),
+        result = {
+          // 6 bytes reserved
+          dataReferenceIndex: view.getUint16(6),
+          // 4 + 4 bytes reserved
+          channelcount: view.getUint16(16),
+          samplesize: view.getUint16(18),
+          // 2 bytes pre_defined
+          // 2 bytes reserved
+          samplerate: view.getUint16(24) + (view.getUint16(26) / 65536)
+        };
+      // if there are more bytes to process, assume this is an ISO/IEC
+      // 14496-14 MP4AudioSampleEntry and parse the ESDBox
+      if (data.byteLength > 28) {
+        result.streamDescriptor = videojs.inspectMp4(data.subarray(28))[0];
+      }
+      return result;
+    },
    moof: function(data) {
      return {
        boxes: videojs.inspectMp4(data)
@@ -357,6 +402,13 @@ var
      return result;
    },
+    smhd: function(data) {
+      return {
+        version: data[0],
+        flags: new Uint8Array(data.subarray(1, 4)),
+        balance: data[4] + (data[5] / 256)
+      };
+    },
    stbl: function(data) {
      return {
        boxes: videojs.inspectMp4(data)
--- a/test/muxer/mp4.html
View file @def510a
+++ b/test/muxer/mp4.html
View file @def510a
@@ -181,8 +181,8 @@
    mediaSource.addEventListener('sourceopen', function() {
      var
-        buffer = mediaSource.addSourceBuffer('video/mp4;codecs=avc1.4d400d'),
+        // buffer = mediaSource.addSourceBuffer('video/mp4;codecs=avc1.4d400d');
-        one = false;
+        buffer = mediaSource.addSourceBuffer('audio/mp4;codecs=mp4a.40.2');
      buffer.addEventListener('updatestart', logevent);
      buffer.addEventListener('updateend', logevent);
      buffer.addEventListener('error', logevent);
@@ -211,27 +211,43 @@
        var segment = new Uint8Array(reader.result),
            transmuxer = new videojs.mp2t.Transmuxer(),
            events = [],
+            i = 0,
+            bytesLength = 0,
+            init = false,
            bytes,
            hex = '';
        transmuxer.on('data', function(data) {
-          if (data) {
+          if (data && data.type === 'audio') {
            events.push(data.data);
+            bytesLength += data.data.byteLength;
+            // XXX Media Sources Testing
+            if (!init) {
+              vjsParsed = videojs.inspectMp4(data.data);
+              console.log('appended tmuxed output');
+              window.vjsSourceBuffer.appendBuffer(data.data);
+              init = true;
+            }
          }
        });
        transmuxer.push(segment);
        transmuxer.end();
-        bytes = new Uint8Array(events[0].byteLength + events[1].byteLength);
+        bytes = new Uint8Array(bytesLength);
-        bytes.set(events[0]);
+        i = 0;
-        bytes.set(events[1], events[0].byteLength);
+        while (events.length) {
+          bytes.set(events[0], i);
+          i += events[0].byteLength;
+          events.shift();
+        }
-        vjsParsed = videojs.inspectMp4(bytes);
+        // vjsParsed = videojs.inspectMp4(bytes);
-        console.log('transmuxed', vjsParsed);
+        console.log('transmuxed', videojs.inspectMp4(bytes));
        diffParsed();
        // clear old box info
-        vjsBoxes.innerHTML = stringify(vjsParsed, null, ' ');
+        vjsBoxes.innerHTML = stringify(videojs.inspectMp4(bytes), null, ' ');
        // write out the result
        hex += '<pre>';
@@ -263,8 +279,7 @@
        workingOutput.innerHTML = hex;
        // XXX Media Sources Testing
-        window.vjsSourceBuffer.appendBuffer(bytes);
+        // window.vjsSourceBuffer.appendBuffer(bytes);
-        console.log('appended bytes');
      });
      reader.readAsArrayBuffer(this.files[0]);
    }, false);
--- a/test/muxer/mse-demo.html
View file @def510a
+++ b/test/muxer/mse-demo.html
View file @def510a
@@ -76,27 +76,41 @@
    // setup the media source
    mediaSource.addEventListener('sourceopen', function() {
-      var buffer = mediaSource.addSourceBuffer('video/mp4;codecs=avc1.4d400d'),
+      var videoBuffer = mediaSource.addSourceBuffer('video/mp4;codecs=avc1.4d400d'),
+          audioBuffer = mediaSource.addSourceBuffer('audio/mp4;codecs=mp4a.40.2'),
          transmuxer = new videojs.mp2t.Transmuxer(),
-          segments = [];
+          videoSegments = [],
+          audioSegments = [];
      // expose the machinery for debugging
      window.vjsMediaSource = mediaSource;
-      window.vjsSourceBuffer = buffer;
+      window.vjsSourceBuffer = videoBuffer;
      window.vjsVideo = demo;
      // transmux the MPEG-TS data to BMFF segments
      transmuxer.on('data', function(segment) {
-        segments.push(segment);
+        if (segment.type === 'video') {
+          videoSegments.push(segment);
+        } else {
+          audioSegments.push(segment);
+        }
      });
      transmuxer.push(hazeVideo);
      transmuxer.end();
      // buffer up the video data
-      buffer.appendBuffer(segments.shift().data);
+      videoBuffer.appendBuffer(videoSegments.shift().data);
-      buffer.addEventListener('updateend', function() {
+      videoBuffer.addEventListener('updateend', function() {
-        if (segments.length) {
+        if (videoSegments.length) {
-          buffer.appendBuffer(segments.shift().data);
+          videoBuffer.appendBuffer(videoSegments.shift().data);
+        }
+      });
+      // buffer up the audio data
+      audioBuffer.appendBuffer(audioSegments.shift().data);
+      audioBuffer.addEventListener('updateend', function() {
+        if (audioSegments.length) {
+          audioBuffer.appendBuffer(audioSegments.shift().data);
        }
      });
    });
--- a/test/muxer/player.html
View file @def510a
+++ b/test/muxer/player.html
View file @def510a
@@ -94,8 +94,10 @@
        var onMediaSourceOpen = function() {
          console.log('on media open');
          ms.removeEventListener('sourceopen', onMediaSourceOpen);
-          var sourceBuffer = ms.addSourceBuffer('video/mp4;codecs="avc1.4D400D"');
+          var videoBuffer = ms.addSourceBuffer('video/mp4;codecs="avc1.4D400D"');
-          sourceBuffer.appendBuffer(bytes);  
+          videoBuffer.appendBuffer(bytes);
+          var audioBuffer = ms.addSourceBuffer('audio/mp4;codecs=mp4a.40.2');
        };
        ms.addEventListener('sourceopen', onMediaSourceOpen);
--- a/test/transmuxer_test.js
View file @def510a
+++ b/test/transmuxer_test.js
View file @def510a
@@ -47,7 +47,9 @@ var
  validateTrack,
  validateTrackFragment,
-  videoPes;
+  transportPacket,
+  videoPes,
+  audioPes;
 module('MP2T Packet Stream', {
  setup: function() {
@@ -397,15 +399,22 @@ test('parses an elementary stream packet with a pts and dts', function() {
  equal(2 / 90, packet.dts, 'parsed the dts');
 });
-// helper function to create video PES packets
+/**
-videoPes = function(data, first) {
+ * Helper function to create transport stream PES packets
+ * @param pid {uint8} - the low byte of the packet identifier (PID)
+ * @param data {arraylike} - the payload bytes
+ * @param first {boolean} - true if this PES should be a payload
+ * unit start
+ */
+transportPacket = function(pid, data, first) {
  var
-    adaptationFieldLength = 188 - data.length - (first ? 18 : 17),
+    adaptationFieldLength = 188 - data.length - (first ? 15 : 14),
+    // transport_packet(), Rec. ITU-T H.222.0, Table 2-2
    result = [
      // sync byte
      0x47,
      // tei:0 pusi:1 tp:0 pid:0 0000 0001 0001
-      0x40, 0x11,
+      0x40, pid,
      // tsc:01 afc:11 cc:0000
      0x70
    ].concat([
@@ -422,6 +431,7 @@ videoPes = function(data, first) {
    result.push(0xff);
  }
+  // PES_packet(), Rec. ITU-T H.222.0, Table 2-21
  result = result.concat([
    // pscp:0000 0000 0000 0000 0000 0001
    0x00, 0x00, 0x01,
@@ -437,14 +447,41 @@ videoPes = function(data, first) {
  if (first) {
    result.push(0x00);
  }
-  result = result.concat([
+  return result.concat(data);
+};
+/**
+ * Helper function to create video PES packets
+ * @param data {arraylike} - the payload bytes
+ * @param first {boolean} - true if this PES should be a payload
+ * unit start
+ */
+videoPes = function(data, first) {
+  return transportPacket(0x11, [
    // NAL unit start code
    0x00, 0x00, 0x01
-  ].concat(data));
+  ].concat(data), first);
-  return result;
 };
 standalonePes = videoPes([0xaf, 0x01], true);
+/**
+ * Helper function to create audio PES packets
+ * @param data {arraylike} - the payload bytes
+ * @param first {boolean} - true if this PES should be a payload
+ * unit start
+ */
+audioPes = function(data, first) {
+  var frameLength = data.length + 7;
+  return transportPacket(0x12, [
+    0xff, 0xf1,                            // no CRC
+    0x10,                                  // AAC Main, 44.1KHz
+    0xb0 | ((frameLength & 0x1800) >> 11), // 2 channels
+    (frameLength & 0x7f8) >> 3,
+    ((frameLength & 0x07) << 5) + 7,       // frame length in bytes
+    0x00                                   // one AAC per ADTS frame
+  ].concat(data), first);
+};
 test('parses an elementary stream packet without a pts or dts', function() {
  var packet;
@@ -950,17 +987,24 @@ test('generates AAC frame events from ADTS bytes', function() {
  aacStream.push({
    type: 'audio',
    data: new Uint8Array([
-      0xff, 0xf1, // no CRC
+      0xff, 0xf1,       // no CRC
-      0x00, // AAC Main, 44.1KHz
+      0x10,             // AAC Main, 44.1KHz
-      0xfc, 0x01, 0x20, // frame length 9 bytes
+      0xbc, 0x01, 0x20, // 2 channels, frame length 9 bytes
-      0x00, // one AAC per ADTS frame
+      0x00,             // one AAC per ADTS frame
-      0x12, 0x34, // AAC payload
+      0x12, 0x34,       // AAC payload
-      0x56, 0x78 // extra junk that should be ignored
+      0x56, 0x78        // extra junk that should be ignored
    ])
  });
  equal(frames.length, 1, 'generated one frame');
  deepEqual(frames[0].data, new Uint8Array([0x12, 0x34]), 'extracted AAC frame');
+  equal(frames[0].channelcount, 2, 'parsed channelcount');
+  equal(frames[0].samplerate, 44100, 'parsed samplerate');
+  // Chrome only supports 8, 16, and 32 bit sample sizes. Assuming the
+  // default value of 16 in ISO/IEC 14496-12 AudioSampleEntry is
+  // acceptable.
+  equal(frames[0].samplesize, 16, 'parsed samplesize');
 });
 // not handled: ADTS with CRC
@@ -972,7 +1016,7 @@ module('Transmuxer', {
  }
 });
-test('generates an init segment', function() {
+test('generates a video init segment', function() {
  var segments = [];
  transmuxer.on('data', function(segment) {
    segments.push(segment);
@@ -980,16 +1024,38 @@ test('generates an init segment', function() {
  transmuxer.push(packetize(PAT));
  transmuxer.push(packetize(PMT));
  transmuxer.push(packetize(videoPes([
-    0x07,
+      0x08, 0x01 // pic_parameter_set_rbsp
+  ], true)));
+  transmuxer.push(packetize(videoPes([
+    0x07, // seq_parameter_set_rbsp
    0x27, 0x42, 0xe0, 0x0b,
    0xa9, 0x18, 0x60, 0x9d,
    0x80, 0x53, 0x06, 0x01,
    0x06, 0xb6, 0xc2, 0xb5,
    0xef, 0x7c, 0x04
+  ], false)));
+  transmuxer.end();
+  equal(segments.length, 2, 'generated init and media segments');
+  ok(segments[0].data, 'wrote data in the init segment');
+  equal(segments[0].type, 'video', 'video is the segment type');
+});
+test('generates an audio init segment', function() {
+  var segments = [];
+  transmuxer.on('data', function(segment) {
+    segments.push(segment);
+  });
+  transmuxer.push(packetize(PAT));
+  transmuxer.push(packetize(PMT));
+  transmuxer.push(packetize(audioPes([
+    0x00, 0x01
  ], true)));
  transmuxer.end();
-  equal(segments.length, 1, 'has an init segment');
+  equal(segments.length, 2, 'generated init and media segments');
+  ok(segments[0].data, 'wrote data in the init segment');
+  equal(segments[0].type, 'audio', 'audio is the segment type');
 });
 test('buffers video samples until ended', function() {
@@ -1123,20 +1189,26 @@ validateTrackFragment = function(track, segment, metadata) {
 test('parses an example mp2t file and generates media segments', function() {
  var
-    segments = [],
+    videoSegments = [],
+    audioSegments = [],
    sequenceNumber = window.Infinity,
    i, boxes, mfhd;
  transmuxer.on('data', function(segment) {
-    segments.push(segment);
+    if (segment.type === 'video') {
+      videoSegments.push(segment);
+    } else if (segment.type === 'audio') {
+      audioSegments.push(segment);
+    }
  });
  transmuxer.push(window.bcSegment);
  transmuxer.end();
-  equal(segments.length, 2, 'generated two segments');
+  equal(videoSegments.length, 2, 'generated two video segments');
+  equal(audioSegments.length, 2, 'generated two audio segments');
-  boxes = videojs.inspectMp4(segments[0].data);
+  boxes = videojs.inspectMp4(videoSegments[0].data);
-  equal(boxes.length, 2, 'init segments are composed of two boxes');
+  equal(boxes.length, 2, 'video init segments are composed of two boxes');
  equal(boxes[0].type, 'ftyp', 'the first box is an ftyp');
  equal(boxes[1].type, 'moov', 'the second box is a moov');
  equal(boxes[1].boxes[0].type, 'mvhd', 'generated an mvhd');
@@ -1150,9 +1222,9 @@ test('parses an example mp2t file and generates media segments', function() {
  // });
  // equal(boxes[1].boxes[3].type, 'mvex', 'generated an mvex');
-  boxes = videojs.inspectMp4(segments[1].data);
+  boxes = videojs.inspectMp4(videoSegments[1].data);
-  ok(boxes.length > 0, 'media segments are not empty');
+  ok(boxes.length > 0, 'video media segments are not empty');
-  ok(boxes.length % 2 === 0, 'media segments are composed of pairs of boxes');
+  ok(boxes.length % 2 === 0, 'video media segments are composed of pairs of boxes');
  for (i = 0; i < boxes.length; i += 2) {
    equal(boxes[i].type, 'moof', 'first box is a moof');
    equal(boxes[i].boxes.length, 2, 'the moof has two children');
@@ -1163,7 +1235,7 @@ test('parses an example mp2t file and generates media segments', function() {
    sequenceNumber = mfhd.sequenceNumber;
    equal(boxes[i + 1].type, 'mdat', 'second box is an mdat');
-    validateTrackFragment(boxes[i].boxes[1], segments[1].data, {
+    validateTrackFragment(boxes[i].boxes[1], videoSegments[1].data, {
      trackId: 256,
      width: 388,
      height: 300,