72a6a135 by David LaPalomento

Record AAC sample offsets

The generated fragmented mp4 for the audio track can now be appended to a source buffer in Chrome 43 without errors, but the buffered duration remains zero after the operation completes. All current tests pass. The sample table entries and mdat contents look consistent with an audio-only fragmented mp4 generated by ffmpeg, though some discrepancies still exist in the track metadata.
1 parent 1cc97309
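As a point of reference for the buffered-duration issue described above, this is roughly the kind of manual check involved: hand the generated initialization and media segments to a Media Source Extensions SourceBuffer and inspect its buffered ranges. This is only an illustrative sketch; the initSegment/mediaSegment variables and the codec string are assumptions, not part of this commit.

  var mediaSource = new MediaSource(),
      audio = document.createElement('audio'),
      segments, sourceBuffer;

  // initSegment and mediaSegment stand in for the Uint8Arrays produced
  // by the transmuxer for the audio track (hypothetical names)
  segments = [initSegment, mediaSegment];

  audio.src = URL.createObjectURL(mediaSource);
  mediaSource.addEventListener('sourceopen', function() {
    // the codec string assumes AAC-LC; adjust to match the real stream
    sourceBuffer = mediaSource.addSourceBuffer('audio/mp4; codecs="mp4a.40.2"');

    sourceBuffer.addEventListener('updateend', function() {
      if (segments.length) {
        // queue the next segment once the previous append finishes
        sourceBuffer.appendBuffer(segments.shift());
        return;
      }
      // as of this commit the appends complete without error but no
      // buffered range ever shows up in Chrome 43
      console.log('buffered ranges:', sourceBuffer.buffered.length);
    });

    sourceBuffer.appendBuffer(segments.shift());
  });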
@@ -333,17 +333,18 @@ sdtp = function(track) {
   var
     samples = track.samples || [],
     bytes = new Uint8Array(4 + samples.length),
-    sample,
+    flags,
     i;
 
   // leave the full box header (4 bytes) all zero
 
   // write the sample table
   for (i = 0; i < samples.length; i++) {
-    sample = samples[i];
-    bytes[i + 4] = (sample.flags.dependsOn << 4) |
-      (sample.flags.isDependedOn << 2) |
-      (sample.flags.hasRedundancy);
+    flags = samples[i].flags;
+
+    bytes[i + 4] = (flags.dependsOn << 4) |
+      (flags.isDependedOn << 2) |
+      (flags.hasRedundancy);
   }
 
   return box(types.sdtp,
@@ -508,30 +509,58 @@ tkhd = function(track) {
   return box(types.tkhd, result);
 };
 
+/**
+ * Generate a track fragment (traf) box. A traf box collects metadata
+ * about tracks in a movie fragment (moof) box.
+ */
 traf = function(track) {
-  var sampleDependencyTable = sdtp(track);
-  return box(types.traf,
-             box(types.tfhd, new Uint8Array([
-               0x00, // version 0
-               0x00, 0x00, 0x00, // flags
-               (track.id & 0xFF000000) >> 24,
-               (track.id & 0xFF0000) >> 16,
-               (track.id & 0xFF00) >> 8,
-               (track.id & 0xFF) // track_ID
-             ])),
-             box(types.tfdt, new Uint8Array([
-               0x00, // version 0
-               0x00, 0x00, 0x00, // flags
-               0x00, 0x00, 0x00, 0x00 // baseMediaDecodeTime
-             ])),
-             trun(track,
-                  sampleDependencyTable.length +
-                  16 + // tfhd
-                  16 + // tfdt
-                  8 + // traf header
-                  16 + // mfhd
-                  8 + // moof header
-                  8), // mdat header
+  var trackFragmentHeader, trackFragmentDecodeTime,
+      trackFragmentRun, sampleDependencyTable, dataOffset;
+
+  trackFragmentHeader = box(types.tfhd, new Uint8Array([
+    0x00, // version 0
+    0x00, 0x00, 0x00, // flags
+    (track.id & 0xFF000000) >> 24,
+    (track.id & 0xFF0000) >> 16,
+    (track.id & 0xFF00) >> 8,
+    (track.id & 0xFF) // track_ID
+  ]));
+
+  trackFragmentDecodeTime = box(types.tfdt, new Uint8Array([
+    0x00, // version 0
+    0x00, 0x00, 0x00, // flags
+    0x00, 0x00, 0x00, 0x00 // baseMediaDecodeTime
+  ]));
+
+  // the data offset specifies the number of bytes from the start of
+  // the containing moof to the first payload byte of the associated
+  // mdat
+  dataOffset = (16 + // tfhd
+                16 + // tfdt
+                8 + // traf header
+                16 + // mfhd
+                8 + // moof header
+                8); // mdat header
+
+  // audio tracks require less metadata
+  if (track.type === 'audio') {
+    trackFragmentRun = trun(track, dataOffset);
+    return box(types.traf,
+               trackFragmentHeader,
+               trackFragmentDecodeTime,
+               trackFragmentRun);
+  }
+
+  // video tracks should contain an independent and disposable samples
+  // box (sdtp)
+  // generate one and adjust offsets to match
+  sampleDependencyTable = sdtp(track);
+  trackFragmentRun = trun(track,
+                          sampleDependencyTable.length + dataOffset);
+  return box(types.traf,
+             trackFragmentHeader,
+             trackFragmentDecodeTime,
+             trackFragmentRun,
              sampleDependencyTable);
 };
 
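To make the byte accounting in the traf change concrete: for an audio track with two samples (the case exercised by the new unit test further down), the numbers work out as follows. This is a worked check of the arithmetic, not code from the commit.

  // each box is an 8-byte header plus its payload, per the layouts above
  var sampleCount = 2,
      trunSize = 8 + 12 + (8 * sampleCount), // header + version/flags/sample_count/data_offset + 8 bytes per audio sample = 36
      trafSize = 8 + 16 + 16 + trunSize,     // traf header + tfhd + tfdt + trun = 76
      moofSize = 8 + 16 + trafSize,          // moof header + mfhd + traf = 100
      dataOffset = moofSize + 8;             // skip the 8-byte mdat header to reach the first sample byte

  // the same value traf()/trun() arrive at: 72 + 20 + 8 * sampleCount
  console.log(dataOffset); // 108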
@@ -571,15 +600,38 @@ trex = function(track) {
   return box(types.trex, result);
 };
 
-trun = function(track, offset) {
-  var bytes, samples, sample, i;
-
-  samples = track.samples || [];
-  offset += 8 + 12 + (16 * samples.length);
-
-  bytes = [
-    0x00, // version 0
-    0x00, 0x0f, 0x01, // flags
-    (samples.length & 0xFF000000) >>> 24,
-    (samples.length & 0xFF0000) >>> 16,
-    (samples.length & 0xFF00) >>> 8,
+(function() {
+  var audioTrun, videoTrun, trunHeader;
+
+  // This method assumes all samples are uniform. That is, if a
+  // duration is present for the first sample, it will be present for
+  // all subsequent samples.
+  // see ISO/IEC 14496-12:2012, Section 8.8.8.1
+  trunHeader = function(samples, offset) {
+    var durationPresent = 0, sizePresent = 0,
+        flagsPresent = 0, compositionTimeOffset = 0;
+
+    // trun flag constants
+    if (samples.length) {
+      if (samples[0].duration !== undefined) {
+        durationPresent = 0x1;
+      }
+      if (samples[0].size !== undefined) {
+        sizePresent = 0x2;
+      }
+      if (samples[0].flags !== undefined) {
+        flagsPresent = 0x4;
+      }
+      if (samples[0].compositionTimeOffset !== undefined) {
+        compositionTimeOffset = 0x8;
+      }
+    }
+
+    return [
+      0x00, // version 0
+      0x00,
+      durationPresent | sizePresent | flagsPresent | compositionTimeOffset,
+      0x01, // flags
+      (samples.length & 0xFF000000) >>> 24,
+      (samples.length & 0xFF0000) >>> 16,
+      (samples.length & 0xFF00) >>> 8,
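A quick check of the flag bytes produced by trunHeader above, against ISO/IEC 14496-12 Section 8.8.8: the middle flags byte is the OR of the per-sample field bits, and the low byte (0x01) keeps data-offset-present set, so video tracks reproduce the previously hard-coded value while audio tracks drop the bits they never use. The values below are derived from the code, not taken from the commit.

  // video samples carry duration, size, flags and compositionTimeOffset
  var videoFlags = 0x1 | 0x2 | 0x4 | 0x8; // 0x0f -> flag bytes 0x00, 0x0f, 0x01, the old hard-coded value
  // audio samples carry only duration and size
  var audioFlags = 0x1 | 0x2;             // 0x03 -> flag bytes 0x00, 0x03, 0x01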
@@ -589,6 +641,15 @@ trun = function(track, offset) {
       (offset & 0xFF00) >>> 8,
       offset & 0xFF // data_offset
     ];
+  };
+
+  videoTrun = function(track, offset) {
+    var bytes, samples, sample, i;
+
+    samples = track.samples || [];
+    offset += 8 + 12 + (16 * samples.length);
+
+    bytes = trunHeader(samples, offset);
 
     for (i = 0; i < samples.length; i++) {
       sample = samples[i];
@@ -615,7 +676,40 @@ trun = function(track, offset) {
       ]);
     }
     return box(types.trun, new Uint8Array(bytes));
   };
+
+  audioTrun = function(track, offset) {
+    var bytes, samples, sample, i;
+
+    samples = track.samples || [];
+    offset += 8 + 12 + (8 * samples.length);
+
+    bytes = trunHeader(samples, offset);
+
+    for (i = 0; i < samples.length; i++) {
+      sample = samples[i];
+      bytes = bytes.concat([
+        (sample.duration & 0xFF000000) >>> 24,
+        (sample.duration & 0xFF0000) >>> 16,
+        (sample.duration & 0xFF00) >>> 8,
+        sample.duration & 0xFF, // sample_duration
+        (sample.size & 0xFF000000) >>> 24,
+        (sample.size & 0xFF0000) >>> 16,
+        (sample.size & 0xFF00) >>> 8,
+        sample.size & 0xFF]); // sample_size
+    }
+
+    return box(types.trun, new Uint8Array(bytes));
+  };
+
+  trun = function(track, offset) {
+    if (track.type === 'audio') {
+      return audioTrun(track, offset);
+    } else {
+      return videoTrun(track, offset);
+    }
+  };
+})();
 
 window.videojs.mp4 = {
   ftyp: ftyp,
...
@@ -486,7 +486,8 @@ AudioSegmentStream = function(track) {
   // concatenate the audio data to constuct the mdat
   data = new Uint8Array(aacFramesLength);
   track.samples = [];
-  while (aacFramesLength.length) {
+  i = 0;
+  while (aacFrames.length) {
     currentFrame = aacFrames[0];
     sample = {
       size: currentFrame.data.byteLength,
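For context, the i = 0 added above initializes a running byte offset into the mdat payload. The rest of the loop falls outside this hunk, but presumably continues roughly as sketched below; variable names come from the hunk, the copy and offset handling are assumptions, and the per-sample duration is omitted.

  while (aacFrames.length) {
    currentFrame = aacFrames[0];
    sample = {
      size: currentFrame.data.byteLength
      // duration and any other per-sample fields omitted in this sketch
    };
    track.samples.push(sample);

    // copy the frame payload into the mdat and advance the running
    // offset (assuming currentFrame.data is a Uint8Array)
    data.set(currentFrame.data, i);
    i += currentFrame.data.byteLength;

    aacFrames.shift();
  }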
...
@@ -518,6 +518,35 @@ test('generates a minimal moof', function() {
   }, 'wrote the sample data table');
 });
 
+test('generates a moof for audio', function() {
+  var
+    data = mp4.moof(7, [{
+      id: 17,
+      type: 'audio',
+      samples: [{
+        duration: 9000,
+        size: 10
+      }, {
+        duration: 10000,
+        size: 11
+      }]
+    }]),
+    moof = videojs.inspectMp4(data),
+    trun;
+
+  deepEqual(moof[0].boxes[1].boxes.length, 3, 'generated three traf children');
+  trun = moof[0].boxes[1].boxes[2];
+  ok(trun, 'generated a trun');
+  deepEqual(trun.dataOffset, data.byteLength + 8, 'calculated the data offset');
+  deepEqual(trun.samples, [{
+    duration: 9000,
+    size: 10
+  }, {
+    duration: 10000,
+    size: 11
+  }], 'wrote simple audio samples');
+});
+
 test('can generate a traf without samples', function() {
   var
     data = mp4.moof(8, [{
...
@@ -39,6 +39,12 @@ var
   for (i = 0; i < avcStream.length; i += length) {
     length = avcView.getUint32(i);
     i += 4;
+
+    // bail if this doesn't appear to be an H264 stream
+    if (length <= 0) {
+      return;
+    }
+
     switch(avcStream[i] & 0x1F) {
     case 0x01:
       result.push('slice_layer_without_partitioning_rbsp');
...