Record AAC sample offsets
The generated fragmented mp4 for the audio track can now be appended to a source buffer in Chrome 43 without errors, but the buffered duration remains zero after the operation completes. All current tests pass. Sample table entries and mdat contents look consistent with an audio-only, fragmented mp4 generated by ffmpeg, though some discrepancies still exist in track metadata.
Showing
4 changed files
with
200 additions
and
70 deletions
... | @@ -333,17 +333,18 @@ sdtp = function(track) { | ... | @@ -333,17 +333,18 @@ sdtp = function(track) { |
333 | var | 333 | var |
334 | samples = track.samples || [], | 334 | samples = track.samples || [], |
335 | bytes = new Uint8Array(4 + samples.length), | 335 | bytes = new Uint8Array(4 + samples.length), |
336 | sample, | 336 | flags, |
337 | i; | 337 | i; |
338 | 338 | ||
339 | // leave the full box header (4 bytes) all zero | 339 | // leave the full box header (4 bytes) all zero |
340 | 340 | ||
341 | // write the sample table | 341 | // write the sample table |
342 | for (i = 0; i < samples.length; i++) { | 342 | for (i = 0; i < samples.length; i++) { |
343 | sample = samples[i]; | 343 | flags = samples[i].flags; |
344 | bytes[i + 4] = (sample.flags.dependsOn << 4) | | 344 | |
345 | (sample.flags.isDependedOn << 2) | | 345 | bytes[i + 4] = (flags.dependsOn << 4) | |
346 | (sample.flags.hasRedundancy); | 346 | (flags.isDependedOn << 2) | |
347 | (flags.hasRedundancy); | ||
347 | } | 348 | } |
348 | 349 | ||
349 | return box(types.sdtp, | 350 | return box(types.sdtp, |
... | @@ -508,30 +509,58 @@ tkhd = function(track) { | ... | @@ -508,30 +509,58 @@ tkhd = function(track) { |
508 | return box(types.tkhd, result); | 509 | return box(types.tkhd, result); |
509 | }; | 510 | }; |
510 | 511 | ||
512 | /** | ||
513 | * Generate a track fragment (traf) box. A traf box collects metadata | ||
514 | * about tracks in a movie fragment (moof) box. | ||
515 | */ | ||
511 | traf = function(track) { | 516 | traf = function(track) { |
512 | var sampleDependencyTable = sdtp(track); | 517 | var trackFragmentHeader, trackFragmentDecodeTime, |
518 | trackFragmentRun, sampleDependencyTable, dataOffset; | ||
519 | |||
520 | trackFragmentHeader = box(types.tfhd, new Uint8Array([ | ||
521 | 0x00, // version 0 | ||
522 | 0x00, 0x00, 0x00, // flags | ||
523 | (track.id & 0xFF000000) >> 24, | ||
524 | (track.id & 0xFF0000) >> 16, | ||
525 | (track.id & 0xFF00) >> 8, | ||
526 | (track.id & 0xFF) // track_ID | ||
527 | ])); | ||
528 | |||
529 | trackFragmentDecodeTime = box(types.tfdt, new Uint8Array([ | ||
530 | 0x00, // version 0 | ||
531 | 0x00, 0x00, 0x00, // flags | ||
532 | 0x00, 0x00, 0x00, 0x00 // baseMediaDecodeTime | ||
533 | ])); | ||
534 | |||
535 | // the data offset specifies the number of bytes from the start of | ||
536 | // the containing moof to the first payload byte of the associated | ||
537 | // mdat | ||
538 | dataOffset = (16 + // tfhd | ||
539 | 16 + // tfdt | ||
540 | 8 + // traf header | ||
541 | 16 + // mfhd | ||
542 | 8 + // moof header | ||
543 | 8); // mdat header | ||
544 | |||
545 | // audio tracks require less metadata | ||
546 | if (track.type === 'audio') { | ||
547 | trackFragmentRun = trun(track, dataOffset); | ||
548 | return box(types.traf, | ||
549 | trackFragmentHeader, | ||
550 | trackFragmentDecodeTime, | ||
551 | trackFragmentRun); | ||
552 | } | ||
553 | |||
554 | // video tracks should contain an independent and disposable samples | ||
555 | // box (sdtp) | ||
556 | // generate one and adjust offsets to match | ||
557 | sampleDependencyTable = sdtp(track); | ||
558 | trackFragmentRun = trun(track, | ||
559 | sampleDependencyTable.length + dataOffset); | ||
513 | return box(types.traf, | 560 | return box(types.traf, |
514 | box(types.tfhd, new Uint8Array([ | 561 | trackFragmentHeader, |
515 | 0x00, // version 0 | 562 | trackFragmentDecodeTime, |
516 | 0x00, 0x00, 0x00, // flags | 563 | trackFragmentRun, |
517 | (track.id & 0xFF000000) >> 24, | ||
518 | (track.id & 0xFF0000) >> 16, | ||
519 | (track.id & 0xFF00) >> 8, | ||
520 | (track.id & 0xFF) // track_ID | ||
521 | ])), | ||
522 | box(types.tfdt, new Uint8Array([ | ||
523 | 0x00, // version 0 | ||
524 | 0x00, 0x00, 0x00, // flags | ||
525 | 0x00, 0x00, 0x00, 0x00 // baseMediaDecodeTime | ||
526 | ])), | ||
527 | trun(track, | ||
528 | sampleDependencyTable.length + | ||
529 | 16 + // tfhd | ||
530 | 16 + // tfdt | ||
531 | 8 + // traf header | ||
532 | 16 + // mfhd | ||
533 | 8 + // moof header | ||
534 | 8), // mdat header | ||
535 | sampleDependencyTable); | 564 | sampleDependencyTable); |
536 | }; | 565 | }; |
537 | 566 | ||
... | @@ -571,51 +600,116 @@ trex = function(track) { | ... | @@ -571,51 +600,116 @@ trex = function(track) { |
571 | return box(types.trex, result); | 600 | return box(types.trex, result); |
572 | }; | 601 | }; |
573 | 602 | ||
574 | trun = function(track, offset) { | 603 | (function() { |
575 | var bytes, samples, sample, i; | 604 | var audioTrun, videoTrun, trunHeader; |
605 | |||
606 | // This method assumes all samples are uniform. That is, if a | ||
607 | // duration is present for the first sample, it will be present for | ||
608 | // all subsequent samples. | ||
609 | // see ISO/IEC 14496-12:2012, Section 8.8.8.1 | ||
610 | trunHeader = function(samples, offset) { | ||
611 | var durationPresent = 0, sizePresent = 0, | ||
612 | flagsPresent = 0, compositionTimeOffset = 0; | ||
613 | |||
614 | // trun flag constants | ||
615 | if (samples.length) { | ||
616 | if (samples[0].duration !== undefined) { | ||
617 | durationPresent = 0x1; | ||
618 | } | ||
619 | if (samples[0].size !== undefined) { | ||
620 | sizePresent = 0x2; | ||
621 | } | ||
622 | if (samples[0].flags !== undefined) { | ||
623 | flagsPresent = 0x4; | ||
624 | } | ||
625 | if (samples[0].compositionTimeOffset !== undefined) { | ||
626 | compositionTimeOffset = 0x8; | ||
627 | } | ||
628 | } | ||
576 | 629 | ||
577 | samples = track.samples || []; | 630 | return [ |
578 | offset += 8 + 12 + (16 * samples.length); | 631 | 0x00, // version 0 |
632 | 0x00, | ||
633 | durationPresent | sizePresent | flagsPresent | compositionTimeOffset, | ||
634 | 0x01, // flags | ||
635 | (samples.length & 0xFF000000) >>> 24, | ||
636 | (samples.length & 0xFF0000) >>> 16, | ||
637 | (samples.length & 0xFF00) >>> 8, | ||
638 | samples.length & 0xFF, // sample_count | ||
639 | (offset & 0xFF000000) >>> 24, | ||
640 | (offset & 0xFF0000) >>> 16, | ||
641 | (offset & 0xFF00) >>> 8, | ||
642 | offset & 0xFF // data_offset | ||
643 | ]; | ||
644 | }; | ||
579 | 645 | ||
580 | bytes = [ | 646 | videoTrun = function(track, offset) { |
581 | 0x00, // version 0 | 647 | var bytes, samples, sample, i; |
582 | 0x00, 0x0f, 0x01, // flags | 648 | |
583 | (samples.length & 0xFF000000) >>> 24, | 649 | samples = track.samples || []; |
584 | (samples.length & 0xFF0000) >>> 16, | 650 | offset += 8 + 12 + (16 * samples.length); |
585 | (samples.length & 0xFF00) >>> 8, | 651 | |
586 | samples.length & 0xFF, // sample_count | 652 | bytes = trunHeader(samples, offset); |
587 | (offset & 0xFF000000) >>> 24, | 653 | |
588 | (offset & 0xFF0000) >>> 16, | 654 | for (i = 0; i < samples.length; i++) { |
589 | (offset & 0xFF00) >>> 8, | 655 | sample = samples[i]; |
590 | offset & 0xFF // data_offset | 656 | bytes = bytes.concat([ |
591 | ]; | 657 | (sample.duration & 0xFF000000) >>> 24, |
658 | (sample.duration & 0xFF0000) >>> 16, | ||
659 | (sample.duration & 0xFF00) >>> 8, | ||
660 | sample.duration & 0xFF, // sample_duration | ||
661 | (sample.size & 0xFF000000) >>> 24, | ||
662 | (sample.size & 0xFF0000) >>> 16, | ||
663 | (sample.size & 0xFF00) >>> 8, | ||
664 | sample.size & 0xFF, // sample_size | ||
665 | (sample.flags.isLeading << 2) | sample.flags.dependsOn, | ||
666 | (sample.flags.isDependedOn << 6) | | ||
667 | (sample.flags.hasRedundancy << 4) | | ||
668 | (sample.flags.paddingValue << 1) | | ||
669 | sample.flags.isNonSyncSample, | ||
670 | sample.flags.degradationPriority & 0xF0 << 8, | ||
671 | sample.flags.degradationPriority & 0x0F, // sample_flags | ||
672 | (sample.compositionTimeOffset & 0xFF000000) >>> 24, | ||
673 | (sample.compositionTimeOffset & 0xFF0000) >>> 16, | ||
674 | (sample.compositionTimeOffset & 0xFF00) >>> 8, | ||
675 | sample.compositionTimeOffset & 0xFF // sample_composition_time_offset | ||
676 | ]); | ||
677 | } | ||
678 | return box(types.trun, new Uint8Array(bytes)); | ||
679 | }; | ||
592 | 680 | ||
593 | for (i = 0; i < samples.length; i++) { | 681 | audioTrun = function(track, offset) { |
594 | sample = samples[i]; | 682 | var bytes, samples, sample, i; |
595 | bytes = bytes.concat([ | 683 | |
596 | (sample.duration & 0xFF000000) >>> 24, | 684 | samples = track.samples || []; |
597 | (sample.duration & 0xFF0000) >>> 16, | 685 | offset += 8 + 12 + (8 * samples.length); |
598 | (sample.duration & 0xFF00) >>> 8, | 686 | |
599 | sample.duration & 0xFF, // sample_duration | 687 | bytes = trunHeader(samples, offset); |
600 | (sample.size & 0xFF000000) >>> 24, | 688 | |
601 | (sample.size & 0xFF0000) >>> 16, | 689 | for (i = 0; i < samples.length; i++) { |
602 | (sample.size & 0xFF00) >>> 8, | 690 | sample = samples[i]; |
603 | sample.size & 0xFF, // sample_size | 691 | bytes = bytes.concat([ |
604 | (sample.flags.isLeading << 2) | sample.flags.dependsOn, | 692 | (sample.duration & 0xFF000000) >>> 24, |
605 | (sample.flags.isDependedOn << 6) | | 693 | (sample.duration & 0xFF0000) >>> 16, |
606 | (sample.flags.hasRedundancy << 4) | | 694 | (sample.duration & 0xFF00) >>> 8, |
607 | (sample.flags.paddingValue << 1) | | 695 | sample.duration & 0xFF, // sample_duration |
608 | sample.flags.isNonSyncSample, | 696 | (sample.size & 0xFF000000) >>> 24, |
609 | sample.flags.degradationPriority & 0xF0 << 8, | 697 | (sample.size & 0xFF0000) >>> 16, |
610 | sample.flags.degradationPriority & 0x0F, // sample_flags | 698 | (sample.size & 0xFF00) >>> 8, |
611 | (sample.compositionTimeOffset & 0xFF000000) >>> 24, | 699 | sample.size & 0xFF]); // sample_size |
612 | (sample.compositionTimeOffset & 0xFF0000) >>> 16, | 700 | } |
613 | (sample.compositionTimeOffset & 0xFF00) >>> 8, | 701 | |
614 | sample.compositionTimeOffset & 0xFF // sample_composition_time_offset | 702 | return box(types.trun, new Uint8Array(bytes)); |
615 | ]); | 703 | }; |
616 | } | 704 | |
617 | return box(types.trun, new Uint8Array(bytes)); | 705 | trun = function(track, offset) { |
618 | }; | 706 | if (track.type === 'audio') { |
707 | return audioTrun(track, offset); | ||
708 | } else { | ||
709 | return videoTrun(track, offset); | ||
710 | } | ||
711 | }; | ||
712 | })(); | ||
619 | 713 | ||
620 | window.videojs.mp4 = { | 714 | window.videojs.mp4 = { |
621 | ftyp: ftyp, | 715 | ftyp: ftyp, | ... | ... |
... | @@ -486,7 +486,8 @@ AudioSegmentStream = function(track) { | ... | @@ -486,7 +486,8 @@ AudioSegmentStream = function(track) { |
486 | // concatenate the audio data to constuct the mdat | 486 | // concatenate the audio data to constuct the mdat |
487 | data = new Uint8Array(aacFramesLength); | 487 | data = new Uint8Array(aacFramesLength); |
488 | track.samples = []; | 488 | track.samples = []; |
489 | while (aacFramesLength.length) { | 489 | i = 0; |
490 | while (aacFrames.length) { | ||
490 | currentFrame = aacFrames[0]; | 491 | currentFrame = aacFrames[0]; |
491 | sample = { | 492 | sample = { |
492 | size: currentFrame.data.byteLength, | 493 | size: currentFrame.data.byteLength, | ... | ... |
... | @@ -518,6 +518,35 @@ test('generates a minimal moof', function() { | ... | @@ -518,6 +518,35 @@ test('generates a minimal moof', function() { |
518 | }, 'wrote the sample data table'); | 518 | }, 'wrote the sample data table'); |
519 | }); | 519 | }); |
520 | 520 | ||
521 | test('generates a moof for audio', function() { | ||
522 | var | ||
523 | data = mp4.moof(7, [{ | ||
524 | id: 17, | ||
525 | type: 'audio', | ||
526 | samples: [{ | ||
527 | duration: 9000, | ||
528 | size: 10 | ||
529 | }, { | ||
530 | duration: 10000, | ||
531 | size: 11 | ||
532 | }] | ||
533 | }]), | ||
534 | moof = videojs.inspectMp4(data), | ||
535 | trun; | ||
536 | |||
537 | deepEqual(moof[0].boxes[1].boxes.length, 3, 'generated three traf children'); | ||
538 | trun = moof[0].boxes[1].boxes[2]; | ||
539 | ok(trun, 'generated a trun'); | ||
540 | deepEqual(trun.dataOffset, data.byteLength + 8, 'calculated the data offset'); | ||
541 | deepEqual(trun.samples, [{ | ||
542 | duration: 9000, | ||
543 | size: 10 | ||
544 | }, { | ||
545 | duration: 10000, | ||
546 | size: 11 | ||
547 | }], 'wrote simple audio samples'); | ||
548 | }); | ||
549 | |||
521 | test('can generate a traf without samples', function() { | 550 | test('can generate a traf without samples', function() { |
522 | var | 551 | var |
523 | data = mp4.moof(8, [{ | 552 | data = mp4.moof(8, [{ | ... | ... |
... | @@ -39,6 +39,12 @@ var | ... | @@ -39,6 +39,12 @@ var |
39 | for (i = 0; i < avcStream.length; i += length) { | 39 | for (i = 0; i < avcStream.length; i += length) { |
40 | length = avcView.getUint32(i); | 40 | length = avcView.getUint32(i); |
41 | i += 4; | 41 | i += 4; |
42 | |||
43 | // bail if this doesn't appear to be an H264 stream | ||
44 | if (length <= 0) { | ||
45 | return; | ||
46 | } | ||
47 | |||
42 | switch(avcStream[i] & 0x1F) { | 48 | switch(avcStream[i] & 0x1F) { |
43 | case 0x01: | 49 | case 0x01: |
44 | result.push('slice_layer_without_partitioning_rbsp'); | 50 | result.push('slice_layer_without_partitioning_rbsp'); | ... | ... |
-
Please register or sign in to post a comment