72a6a135 by David LaPalomento

Record AAC sample offsets

The generated fragmented mp4 for the audio track can now be appended to a source buffer in Chrome 43 without errors but the buffered duration remains zero after the operation completes. All current tests pass. Sample table entries and mdat contents look consistent with an audio-only, fragmented mp4 generated by ffmpeg though some discrepancies still exist in track metadata.
1 parent 1cc97309
...@@ -333,17 +333,18 @@ sdtp = function(track) { ...@@ -333,17 +333,18 @@ sdtp = function(track) {
333 var 333 var
334 samples = track.samples || [], 334 samples = track.samples || [],
335 bytes = new Uint8Array(4 + samples.length), 335 bytes = new Uint8Array(4 + samples.length),
336 sample, 336 flags,
337 i; 337 i;
338 338
339 // leave the full box header (4 bytes) all zero 339 // leave the full box header (4 bytes) all zero
340 340
341 // write the sample table 341 // write the sample table
342 for (i = 0; i < samples.length; i++) { 342 for (i = 0; i < samples.length; i++) {
343 sample = samples[i]; 343 flags = samples[i].flags;
344 bytes[i + 4] = (sample.flags.dependsOn << 4) | 344
345 (sample.flags.isDependedOn << 2) | 345 bytes[i + 4] = (flags.dependsOn << 4) |
346 (sample.flags.hasRedundancy); 346 (flags.isDependedOn << 2) |
347 (flags.hasRedundancy);
347 } 348 }
348 349
349 return box(types.sdtp, 350 return box(types.sdtp,
...@@ -508,30 +509,58 @@ tkhd = function(track) { ...@@ -508,30 +509,58 @@ tkhd = function(track) {
508 return box(types.tkhd, result); 509 return box(types.tkhd, result);
509 }; 510 };
510 511
512 /**
513 * Generate a track fragment (traf) box. A traf box collects metadata
514 * about tracks in a movie fragment (moof) box.
515 */
511 traf = function(track) { 516 traf = function(track) {
512 var sampleDependencyTable = sdtp(track); 517 var trackFragmentHeader, trackFragmentDecodeTime,
518 trackFragmentRun, sampleDependencyTable, dataOffset;
519
520 trackFragmentHeader = box(types.tfhd, new Uint8Array([
521 0x00, // version 0
522 0x00, 0x00, 0x00, // flags
523 (track.id & 0xFF000000) >> 24,
524 (track.id & 0xFF0000) >> 16,
525 (track.id & 0xFF00) >> 8,
526 (track.id & 0xFF) // track_ID
527 ]));
528
529 trackFragmentDecodeTime = box(types.tfdt, new Uint8Array([
530 0x00, // version 0
531 0x00, 0x00, 0x00, // flags
532 0x00, 0x00, 0x00, 0x00 // baseMediaDecodeTime
533 ]));
534
535 // the data offset specifies the number of bytes from the start of
536 // the containing moof to the first payload byte of the associated
537 // mdat
538 dataOffset = (16 + // tfhd
539 16 + // tfdt
540 8 + // traf header
541 16 + // mfhd
542 8 + // moof header
543 8); // mdat header
544
545 // audio tracks require less metadata
546 if (track.type === 'audio') {
547 trackFragmentRun = trun(track, dataOffset);
548 return box(types.traf,
549 trackFragmentHeader,
550 trackFragmentDecodeTime,
551 trackFragmentRun);
552 }
553
554 // video tracks should contain an independent and disposable samples
555 // box (sdtp)
556 // generate one and adjust offsets to match
557 sampleDependencyTable = sdtp(track);
558 trackFragmentRun = trun(track,
559 sampleDependencyTable.length + dataOffset);
513 return box(types.traf, 560 return box(types.traf,
514 box(types.tfhd, new Uint8Array([ 561 trackFragmentHeader,
515 0x00, // version 0 562 trackFragmentDecodeTime,
516 0x00, 0x00, 0x00, // flags 563 trackFragmentRun,
517 (track.id & 0xFF000000) >> 24,
518 (track.id & 0xFF0000) >> 16,
519 (track.id & 0xFF00) >> 8,
520 (track.id & 0xFF) // track_ID
521 ])),
522 box(types.tfdt, new Uint8Array([
523 0x00, // version 0
524 0x00, 0x00, 0x00, // flags
525 0x00, 0x00, 0x00, 0x00 // baseMediaDecodeTime
526 ])),
527 trun(track,
528 sampleDependencyTable.length +
529 16 + // tfhd
530 16 + // tfdt
531 8 + // traf header
532 16 + // mfhd
533 8 + // moof header
534 8), // mdat header
535 sampleDependencyTable); 564 sampleDependencyTable);
536 }; 565 };
537 566
...@@ -571,51 +600,116 @@ trex = function(track) { ...@@ -571,51 +600,116 @@ trex = function(track) {
571 return box(types.trex, result); 600 return box(types.trex, result);
572 }; 601 };
573 602
574 trun = function(track, offset) { 603 (function() {
575 var bytes, samples, sample, i; 604 var audioTrun, videoTrun, trunHeader;
605
606 // This method assumes all samples are uniform. That is, if a
607 // duration is present for the first sample, it will be present for
608 // all subsequent samples.
609 // see ISO/IEC 14496-12:2012, Section 8.8.8.1
610 trunHeader = function(samples, offset) {
611 var durationPresent = 0, sizePresent = 0,
612 flagsPresent = 0, compositionTimeOffset = 0;
613
614 // trun flag constants
615 if (samples.length) {
616 if (samples[0].duration !== undefined) {
617 durationPresent = 0x1;
618 }
619 if (samples[0].size !== undefined) {
620 sizePresent = 0x2;
621 }
622 if (samples[0].flags !== undefined) {
623 flagsPresent = 0x4;
624 }
625 if (samples[0].compositionTimeOffset !== undefined) {
626 compositionTimeOffset = 0x8;
627 }
628 }
576 629
577 samples = track.samples || []; 630 return [
578 offset += 8 + 12 + (16 * samples.length); 631 0x00, // version 0
632 0x00,
633 durationPresent | sizePresent | flagsPresent | compositionTimeOffset,
634 0x01, // flags
635 (samples.length & 0xFF000000) >>> 24,
636 (samples.length & 0xFF0000) >>> 16,
637 (samples.length & 0xFF00) >>> 8,
638 samples.length & 0xFF, // sample_count
639 (offset & 0xFF000000) >>> 24,
640 (offset & 0xFF0000) >>> 16,
641 (offset & 0xFF00) >>> 8,
642 offset & 0xFF // data_offset
643 ];
644 };
579 645
580 bytes = [ 646 videoTrun = function(track, offset) {
581 0x00, // version 0 647 var bytes, samples, sample, i;
582 0x00, 0x0f, 0x01, // flags 648
583 (samples.length & 0xFF000000) >>> 24, 649 samples = track.samples || [];
584 (samples.length & 0xFF0000) >>> 16, 650 offset += 8 + 12 + (16 * samples.length);
585 (samples.length & 0xFF00) >>> 8, 651
586 samples.length & 0xFF, // sample_count 652 bytes = trunHeader(samples, offset);
587 (offset & 0xFF000000) >>> 24, 653
588 (offset & 0xFF0000) >>> 16, 654 for (i = 0; i < samples.length; i++) {
589 (offset & 0xFF00) >>> 8, 655 sample = samples[i];
590 offset & 0xFF // data_offset 656 bytes = bytes.concat([
591 ]; 657 (sample.duration & 0xFF000000) >>> 24,
658 (sample.duration & 0xFF0000) >>> 16,
659 (sample.duration & 0xFF00) >>> 8,
660 sample.duration & 0xFF, // sample_duration
661 (sample.size & 0xFF000000) >>> 24,
662 (sample.size & 0xFF0000) >>> 16,
663 (sample.size & 0xFF00) >>> 8,
664 sample.size & 0xFF, // sample_size
665 (sample.flags.isLeading << 2) | sample.flags.dependsOn,
666 (sample.flags.isDependedOn << 6) |
667 (sample.flags.hasRedundancy << 4) |
668 (sample.flags.paddingValue << 1) |
669 sample.flags.isNonSyncSample,
670 sample.flags.degradationPriority & 0xF0 << 8,
671 sample.flags.degradationPriority & 0x0F, // sample_flags
672 (sample.compositionTimeOffset & 0xFF000000) >>> 24,
673 (sample.compositionTimeOffset & 0xFF0000) >>> 16,
674 (sample.compositionTimeOffset & 0xFF00) >>> 8,
675 sample.compositionTimeOffset & 0xFF // sample_composition_time_offset
676 ]);
677 }
678 return box(types.trun, new Uint8Array(bytes));
679 };
592 680
593 for (i = 0; i < samples.length; i++) { 681 audioTrun = function(track, offset) {
594 sample = samples[i]; 682 var bytes, samples, sample, i;
595 bytes = bytes.concat([ 683
596 (sample.duration & 0xFF000000) >>> 24, 684 samples = track.samples || [];
597 (sample.duration & 0xFF0000) >>> 16, 685 offset += 8 + 12 + (8 * samples.length);
598 (sample.duration & 0xFF00) >>> 8, 686
599 sample.duration & 0xFF, // sample_duration 687 bytes = trunHeader(samples, offset);
600 (sample.size & 0xFF000000) >>> 24, 688
601 (sample.size & 0xFF0000) >>> 16, 689 for (i = 0; i < samples.length; i++) {
602 (sample.size & 0xFF00) >>> 8, 690 sample = samples[i];
603 sample.size & 0xFF, // sample_size 691 bytes = bytes.concat([
604 (sample.flags.isLeading << 2) | sample.flags.dependsOn, 692 (sample.duration & 0xFF000000) >>> 24,
605 (sample.flags.isDependedOn << 6) | 693 (sample.duration & 0xFF0000) >>> 16,
606 (sample.flags.hasRedundancy << 4) | 694 (sample.duration & 0xFF00) >>> 8,
607 (sample.flags.paddingValue << 1) | 695 sample.duration & 0xFF, // sample_duration
608 sample.flags.isNonSyncSample, 696 (sample.size & 0xFF000000) >>> 24,
609 sample.flags.degradationPriority & 0xF0 << 8, 697 (sample.size & 0xFF0000) >>> 16,
610 sample.flags.degradationPriority & 0x0F, // sample_flags 698 (sample.size & 0xFF00) >>> 8,
611 (sample.compositionTimeOffset & 0xFF000000) >>> 24, 699 sample.size & 0xFF]); // sample_size
612 (sample.compositionTimeOffset & 0xFF0000) >>> 16, 700 }
613 (sample.compositionTimeOffset & 0xFF00) >>> 8, 701
614 sample.compositionTimeOffset & 0xFF // sample_composition_time_offset 702 return box(types.trun, new Uint8Array(bytes));
615 ]); 703 };
616 } 704
617 return box(types.trun, new Uint8Array(bytes)); 705 trun = function(track, offset) {
618 }; 706 if (track.type === 'audio') {
707 return audioTrun(track, offset);
708 } else {
709 return videoTrun(track, offset);
710 }
711 };
712 })();
619 713
620 window.videojs.mp4 = { 714 window.videojs.mp4 = {
621 ftyp: ftyp, 715 ftyp: ftyp,
......
...@@ -486,7 +486,8 @@ AudioSegmentStream = function(track) { ...@@ -486,7 +486,8 @@ AudioSegmentStream = function(track) {
486 // concatenate the audio data to construct the mdat 486 // concatenate the audio data to construct the mdat
487 data = new Uint8Array(aacFramesLength); 487 data = new Uint8Array(aacFramesLength);
488 track.samples = []; 488 track.samples = [];
489 while (aacFramesLength.length) { 489 i = 0;
490 while (aacFrames.length) {
490 currentFrame = aacFrames[0]; 491 currentFrame = aacFrames[0];
491 sample = { 492 sample = {
492 size: currentFrame.data.byteLength, 493 size: currentFrame.data.byteLength,
......
...@@ -518,6 +518,35 @@ test('generates a minimal moof', function() { ...@@ -518,6 +518,35 @@ test('generates a minimal moof', function() {
518 }, 'wrote the sample data table'); 518 }, 'wrote the sample data table');
519 }); 519 });
520 520
521 test('generates a moof for audio', function() {
522 var
523 data = mp4.moof(7, [{
524 id: 17,
525 type: 'audio',
526 samples: [{
527 duration: 9000,
528 size: 10
529 }, {
530 duration: 10000,
531 size: 11
532 }]
533 }]),
534 moof = videojs.inspectMp4(data),
535 trun;
536
537 deepEqual(moof[0].boxes[1].boxes.length, 3, 'generated three traf children');
538 trun = moof[0].boxes[1].boxes[2];
539 ok(trun, 'generated a trun');
540 deepEqual(trun.dataOffset, data.byteLength + 8, 'calculated the data offset');
541 deepEqual(trun.samples, [{
542 duration: 9000,
543 size: 10
544 }, {
545 duration: 10000,
546 size: 11
547 }], 'wrote simple audio samples');
548 });
549
521 test('can generate a traf without samples', function() { 550 test('can generate a traf without samples', function() {
522 var 551 var
523 data = mp4.moof(8, [{ 552 data = mp4.moof(8, [{
......
...@@ -39,6 +39,12 @@ var ...@@ -39,6 +39,12 @@ var
39 for (i = 0; i < avcStream.length; i += length) { 39 for (i = 0; i < avcStream.length; i += length) {
40 length = avcView.getUint32(i); 40 length = avcView.getUint32(i);
41 i += 4; 41 i += 4;
42
43 // bail if this doesn't appear to be an H264 stream
44 if (length <= 0) {
45 return;
46 }
47
42 switch(avcStream[i] & 0x1F) { 48 switch(avcStream[i] & 0x1F) {
43 case 0x01: 49 case 0x01:
44 result.push('slice_layer_without_partitioning_rbsp'); 50 result.push('slice_layer_without_partitioning_rbsp');
......