Bug 971641 - Add AudioTrackMetadata and VideoTrackMetadata abstract class. r=cpearce
author: Alfredo Yang <ayang@mozilla.com>
date: Tue, 18 Mar 2014 08:20:18 -0400
changeset: 192537 a6abe0d4e0be64d787308215604efa8ffb51ddb6
parent: 192536 cd104d1d2f8bc5399526528a085d220775798fb6
child: 192538 b7f5aaff655899c7f4cd041dd4eae936b9c15519
push id: 3624
push user: asasaki@mozilla.com
push date: Mon, 09 Jun 2014 21:49:01 +0000
treeherder: mozilla-beta@b1a5da15899a [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: cpearce
bugs: 971641
milestone: 31.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 971641 - Add AudioTrackMetadata and VideoTrackMetadata abstract class. r=cpearce
content/media/encoder/EncodedFrameContainer.h
content/media/encoder/OmxTrackEncoder.cpp
content/media/encoder/TrackMetadataBase.h
content/media/encoder/fmp4_muxer/ISOControl.cpp
content/media/encoder/fmp4_muxer/ISOControl.h
content/media/encoder/fmp4_muxer/ISOMediaBoxes.cpp
content/media/encoder/fmp4_muxer/ISOMediaBoxes.h
content/media/encoder/fmp4_muxer/ISOMediaWriter.cpp
content/media/encoder/fmp4_muxer/ISOMediaWriter.h
content/media/encoder/fmp4_muxer/ISOTrackMetadata.h
content/media/encoder/fmp4_muxer/MP4ESDS.cpp
--- a/content/media/encoder/EncodedFrameContainer.h
+++ b/content/media/encoder/EncodedFrameContainer.h
@@ -53,16 +53,18 @@ public:
     OPUS_AUDIO_FRAME, // Opus audio frame
     VORBIS_AUDIO_FRAME,
     AVC_I_FRAME,
     AVC_P_FRAME,
     AVC_B_FRAME,
     AVC_CSD,          // AVC codec specific data
     AAC_AUDIO_FRAME,
     AAC_CSD,          // AAC codec specific data
+    AMR_AUDIO_CSD,
+    AMR_AUDIO_FRAME,
     UNKNOWN           // FrameType not set
   };
   nsresult SwapInFrameData(nsTArray<uint8_t>& aData)
   {
     mFrameData.SwapElements(aData);
     return NS_OK;
   }
   nsresult SwapOutFrameData(nsTArray<uint8_t>& aData)
--- a/content/media/encoder/OmxTrackEncoder.cpp
+++ b/content/media/encoder/OmxTrackEncoder.cpp
@@ -60,20 +60,19 @@ OmxVideoTrackEncoder::GetMetadata()
     }
   }
 
   if (mCanceled || mEncodingComplete) {
     return nullptr;
   }
 
   nsRefPtr<AVCTrackMetadata> meta = new AVCTrackMetadata();
-  meta->Width = mFrameWidth;
-  meta->Height = mFrameHeight;
-  meta->FrameRate = ENCODER_CONFIG_FRAME_RATE;
-  meta->VideoFrequency = 90000; // Hz
+  meta->mWidth = mFrameWidth;
+  meta->mHeight = mFrameHeight;
+  meta->mFrameRate = ENCODER_CONFIG_FRAME_RATE;
   return meta.forget();
 }
 
 nsresult
 OmxVideoTrackEncoder::GetEncodedTrack(EncodedFrameContainer& aData)
 {
   VideoSegment segment;
   {
@@ -186,20 +185,20 @@ OmxAudioTrackEncoder::GetMetadata()
     }
   }
 
   if (mCanceled || mEncodingComplete) {
     return nullptr;
   }
 
   nsRefPtr<AACTrackMetadata> meta = new AACTrackMetadata();
-  meta->Channels = mChannels;
-  meta->SampleRate = mSamplingRate;
-  meta->FrameSize = OMXCodecWrapper::kAACFrameSize;
-  meta->FrameDuration = OMXCodecWrapper::kAACFrameDuration;
+  meta->mChannels = mChannels;
+  meta->mSampleRate = mSamplingRate;
+  meta->mFrameSize = OMXCodecWrapper::kAACFrameSize;
+  meta->mFrameDuration = OMXCodecWrapper::kAACFrameDuration;
 
   return meta.forget();
 }
 
 nsresult
 OmxAudioTrackEncoder::AppendEncodedFrames(EncodedFrameContainer& aContainer)
 {
   nsTArray<uint8_t> frameData;
--- a/content/media/encoder/TrackMetadataBase.h
+++ b/content/media/encoder/TrackMetadataBase.h
@@ -23,10 +23,38 @@ public:
     METADATA_AAC,
     METADATA_UNKNOWN  // Metadata Kind not set
   };
   virtual ~TrackMetadataBase() {}
   // Return the specific metadata kind
   virtual MetadataKind GetKind() const = 0;
 };
 
+// The base class for audio metadata.
+class AudioTrackMetadata : public TrackMetadataBase {
+public:
+  // The duration of each sample set generated by the encoder, counted in
+  // samples. If the duration varies, this function should return 0.
+  virtual uint32_t GetAudioFrameDuration() = 0;
+  // The size of each sample set generated by the encoder, counted in bytes.
+  // If the size varies, this function should return 0.
+  virtual uint32_t GetAudioFrameSize() = 0;
+  // AudioSampleRate is the number of audio samples per second.
+  virtual uint32_t GetAudioSampleRate() = 0;
+  virtual uint32_t GetAudioChannels() = 0;
+};
+
+// The base class for video metadata.
+class VideoTrackMetadata : public TrackMetadataBase {
+public:
+  virtual uint32_t GetVideoHeight() = 0;
+  virtual uint32_t GetVideoWidth() = 0;
+  // VideoClockRate is the number of clock ticks per second used by the
+  // video frames' timestamps.
+  // For example, if VideoClockRate is 90k Hz and VideoFrameRate is
+  // 30 fps, each frame's sample duration will be 3000 ticks.
+  virtual uint32_t GetVideoClockRate() = 0;
+  // VideoFrameRate is the number of frames per second.
+  virtual uint32_t GetVideoFrameRate() = 0;
+};
+
 }
 #endif
--- a/content/media/encoder/fmp4_muxer/ISOControl.cpp
+++ b/content/media/encoder/fmp4_muxer/ISOControl.cpp
@@ -10,33 +10,25 @@
 #include "EncodedFrameContainer.h"
 
 namespace mozilla {
 
 // For MP4 creation_time and modification_time offset from January 1, 1904 to
 // January 1, 1970.
 #define iso_time_offset 2082844800
 
-FragmentBuffer::FragmentBuffer(uint32_t aTrackType, uint32_t aFragDuration,
-                               TrackMetadataBase* aMetadata)
+FragmentBuffer::FragmentBuffer(uint32_t aTrackType, uint32_t aFragDuration)
   : mTrackType(aTrackType)
   , mFragDuration(aFragDuration)
   , mMediaStartTime(0)
   , mFragmentNumber(0)
   , mLastFrameTimeOfLastFragment(0)
   , mEOS(false)
 {
   mFragArray.AppendElement();
-  if (mTrackType == Audio_Track) {
-    nsRefPtr<AACTrackMetadata> audMeta = static_cast<AACTrackMetadata*>(aMetadata);
-    MOZ_ASSERT(audMeta);
-  } else {
-    nsRefPtr<AVCTrackMetadata> vidMeta = static_cast<AVCTrackMetadata*>(aMetadata);
-    MOZ_ASSERT(vidMeta);
-  }
   MOZ_COUNT_CTOR(FragmentBuffer);
 }
 
 FragmentBuffer::~FragmentBuffer()
 {
   MOZ_COUNT_DTOR(FragmentBuffer);
 }
 
@@ -152,82 +144,76 @@ ISOControl::~ISOControl()
 
 uint32_t
 ISOControl::GetNextTrackID()
 {
   return (mMetaArray.Length() + 1);
 }
 
 uint32_t
-ISOControl::GetTrackID(uint32_t aTrackType)
+ISOControl::GetTrackID(TrackMetadataBase::MetadataKind aKind)
 {
-  TrackMetadataBase::MetadataKind kind;
-  if (aTrackType == Audio_Track) {
-    kind = TrackMetadataBase::METADATA_AAC;
-  } else {
-    kind = TrackMetadataBase::METADATA_AVC;
-  }
-
   for (uint32_t i = 0; i < mMetaArray.Length(); i++) {
-    if (mMetaArray[i]->GetKind() == kind) {
+    if (mMetaArray[i]->GetKind() == aKind) {
       return (i + 1);
     }
   }
 
+  // Track ID shouldn't be 0. Something must be wrong here.
+  MOZ_ASSERT(0);
   return 0;
 }
 
 nsresult
 ISOControl::SetMetadata(TrackMetadataBase* aTrackMeta)
 {
   if (aTrackMeta->GetKind() == TrackMetadataBase::METADATA_AAC ||
       aTrackMeta->GetKind() == TrackMetadataBase::METADATA_AVC) {
     mMetaArray.AppendElement(aTrackMeta);
     return NS_OK;
   }
   return NS_ERROR_FAILURE;
 }
 
 nsresult
-ISOControl::GetAudioMetadata(nsRefPtr<AACTrackMetadata>& aAudMeta)
+ISOControl::GetAudioMetadata(nsRefPtr<AudioTrackMetadata>& aAudMeta)
 {
   for (uint32_t i = 0; i < mMetaArray.Length() ; i++) {
     if (mMetaArray[i]->GetKind() == TrackMetadataBase::METADATA_AAC) {
-      aAudMeta = static_cast<AACTrackMetadata*>(mMetaArray[i].get());
+      aAudMeta = static_cast<AudioTrackMetadata*>(mMetaArray[i].get());
       return NS_OK;
     }
   }
   return NS_ERROR_FAILURE;
 }
 
 nsresult
-ISOControl::GetVideoMetadata(nsRefPtr<AVCTrackMetadata>& aVidMeta)
+ISOControl::GetVideoMetadata(nsRefPtr<VideoTrackMetadata>& aVidMeta)
 {
   for (uint32_t i = 0; i < mMetaArray.Length() ; i++) {
     if (mMetaArray[i]->GetKind() == TrackMetadataBase::METADATA_AVC) {
-      aVidMeta = static_cast<AVCTrackMetadata*>(mMetaArray[i].get());
+      aVidMeta = static_cast<VideoTrackMetadata*>(mMetaArray[i].get());
       return NS_OK;
     }
   }
-
   return NS_ERROR_FAILURE;
 }
 
 bool
 ISOControl::HasAudioTrack()
 {
-  nsRefPtr<AACTrackMetadata> audMeta;
+  nsRefPtr<AudioTrackMetadata> audMeta;
   GetAudioMetadata(audMeta);
   return audMeta;
 }
 
 bool
 ISOControl::HasVideoTrack()
 {
-  nsRefPtr<AVCTrackMetadata> vidMeta;
+  nsRefPtr<VideoTrackMetadata> vidMeta;
   GetVideoMetadata(vidMeta);
   return vidMeta;
 }
 
 nsresult
 ISOControl::SetFragment(FragmentBuffer* aFragment)
 {
   if (aFragment->GetType() == Audio_Track) {
--- a/content/media/encoder/fmp4_muxer/ISOControl.h
+++ b/content/media/encoder/fmp4_muxer/ISOControl.h
@@ -24,18 +24,17 @@ class ISOControl;
  * life cycle, when a fragment is formed in ISOControl, Flush() needs to
  * be called to reset it.
  */
 class FragmentBuffer {
 public:
   // aTrackType: it could be Audio_Track or Video_Track.
   // aFragDuration: it is the fragment duration. (microsecond per unit)
   //                Audio and video have the same fragment duration.
-  FragmentBuffer(uint32_t aTrackType, uint32_t aFragDuration,
-                 TrackMetadataBase* aMetadata);
+  FragmentBuffer(uint32_t aTrackType, uint32_t aFragDuration);
   ~FragmentBuffer();
 
   // Get samples of first fragment, that will swap all the elements in the
   // mFragArray[0] when aFlush = true, and caller is responsible for drop
   // EncodedFrame reference count.
   nsresult GetFirstFragment(nsTArray<nsRefPtr<EncodedFrame>>& aFragment,
                             bool aFlush = false);
 
@@ -186,21 +185,23 @@ public:
   uint32_t GetCurFragmentNumber() { return mFragNum; }
 
   nsresult SetFragment(FragmentBuffer* aFragment);
   FragmentBuffer* GetFragment(uint32_t aType);
 
   uint32_t GetMuxingType() { return mMuxingType; }
 
   nsresult SetMetadata(TrackMetadataBase* aTrackMeta);
-  nsresult GetAudioMetadata(nsRefPtr<AACTrackMetadata>& aAudMeta);
-  nsresult GetVideoMetadata(nsRefPtr<AVCTrackMetadata>& aVidMeta);
+  nsresult GetAudioMetadata(nsRefPtr<AudioTrackMetadata>& aAudMeta);
+  nsresult GetVideoMetadata(nsRefPtr<VideoTrackMetadata>& aVidMeta);
 
-  // Track ID is the Metadata index in mMetaArray.
-  uint32_t GetTrackID(uint32_t aTrackType);
+  // Track ID is the Metadata index in mMetaArray plus one. This muxer allows
+  // only 1 audio track and 1 video track; it is prohibited to have multiple
+  // audio tracks or multiple video tracks in the same file.
+  uint32_t GetTrackID(TrackMetadataBase::MetadataKind aKind);
   uint32_t GetNextTrackID();
 
   bool HasAudioTrack();
   bool HasVideoTrack();
 
 private:
   uint32_t GetBufPos();
   nsresult FlushBuf();
--- a/content/media/encoder/fmp4_muxer/ISOMediaBoxes.cpp
+++ b/content/media/encoder/fmp4_muxer/ISOMediaBoxes.cpp
@@ -159,17 +159,17 @@ TrackRunBox::fillSampleTable()
                      frames.ElementAt(i - 1)->GetTimeStamp();
         // Keep the last frame time of current fagment, it will be used to calculate
         // the first frame duration of next fragment.
         if ((len - 1) == i) {
           frag->SetLastFragmentLastFrameTime(frames.ElementAt(i)->GetTimeStamp());
         }
       }
       sample_info_table[i].sample_duration =
-        frame_time * mMeta.mVidMeta->VideoFrequency / USECS_PER_S;
+        frame_time * mVideoMeta->GetVideoClockRate() / USECS_PER_S;
       table_size += sizeof(uint32_t);
     }
 
     sample_info_table[i].sample_composition_time_offset = 0;
   }
   return table_size;
 }
 
@@ -226,17 +226,16 @@ TrackRunBox::Write()
 TrackRunBox::TrackRunBox(uint32_t aType, uint32_t aFlags, ISOControl* aControl)
   : FullBox(NS_LITERAL_CSTRING("trun"), 0, aFlags, aControl)
   , sample_count(0)
   , data_offset(0)
   , first_sample_flags(0)
   , mAllSampleSize(0)
   , mTrackType(aType)
 {
-  mMeta.Init(aControl);
   MOZ_COUNT_CTOR(TrackRunBox);
 }
 
 TrackRunBox::~TrackRunBox()
 {
   MOZ_COUNT_DTOR(TrackRunBox);
 }
 
@@ -245,37 +244,39 @@ TrackFragmentHeaderBox::UpdateBaseDataOf
 {
   base_data_offset = aOffset;
   return NS_OK;
 }
 
 nsresult
 TrackFragmentHeaderBox::Generate(uint32_t* aBoxSize)
 {
-  track_ID = mControl->GetTrackID(mTrackType);
+  track_ID = (mTrackType == Audio_Track ?
+                mControl->GetTrackID(mAudioMeta->GetKind()) :
+                mControl->GetTrackID(mVideoMeta->GetKind()));
   size += sizeof(track_ID);
 
   if (flags.to_ulong() & base_data_offset_present) {
     // base_data_offset needs to add size of 'trun', 'tfhd' and
     // header of 'mdat' later.
     base_data_offset = 0;
     size += sizeof(base_data_offset);
   }
   if (flags.to_ulong() & default_sample_duration_present) {
     if (mTrackType == Video_Track) {
-      if (!mMeta.mVidMeta->FrameRate) {
+      if (!mVideoMeta->GetVideoFrameRate()) {
         // 0 means frame rate is variant, so it is wrong to write
         // default_sample_duration.
         MOZ_ASSERT(0);
         default_sample_duration = 0;
       } else {
-        default_sample_duration = mMeta.mVidMeta->VideoFrequency / mMeta.mVidMeta->FrameRate;
+        default_sample_duration = mVideoMeta->GetVideoClockRate() / mVideoMeta->GetVideoFrameRate();
       }
     } else if (mTrackType == Audio_Track) {
-      default_sample_duration = mMeta.mAudMeta->FrameDuration;
+      default_sample_duration = mAudioMeta->GetAudioFrameDuration();
     } else {
       MOZ_ASSERT(0);
       return NS_ERROR_FAILURE;
     }
     size += sizeof(default_sample_duration);
   }
   *aBoxSize = size;
   return NS_OK;
@@ -299,17 +300,16 @@ TrackFragmentHeaderBox::TrackFragmentHea
                                                uint32_t aFlags,
                                                ISOControl* aControl)
   : FullBox(NS_LITERAL_CSTRING("tfhd"), 0, aFlags, aControl)
   , track_ID(0)
   , base_data_offset(0)
   , default_sample_duration(0)
 {
   mTrackType = aType;
-  mMeta.Init(mControl);
   MOZ_COUNT_CTOR(TrackFragmentHeaderBox);
 }
 
 TrackFragmentHeaderBox::~TrackFragmentHeaderBox()
 {
   MOZ_COUNT_DTOR(TrackFragmentHeaderBox);
 }
 
@@ -424,30 +424,32 @@ MovieFragmentBox::Generate(uint32_t* aBo
   }
 
   return NS_OK;
 }
 
 nsresult
 TrackExtendsBox::Generate(uint32_t* aBoxSize)
 {
-  track_ID = mControl->GetTrackID(mTrackType);
+  track_ID = (mTrackType == Audio_Track ?
+                mControl->GetTrackID(mAudioMeta->GetKind()) :
+                mControl->GetTrackID(mVideoMeta->GetKind()));
 
   if (mTrackType == Audio_Track) {
     default_sample_description_index = 1;
-    default_sample_duration = mMeta.mAudMeta->FrameDuration;
-    default_sample_size = mMeta.mAudMeta->FrameSize;
+    default_sample_duration = mAudioMeta->GetAudioFrameDuration();
+    default_sample_size = mAudioMeta->GetAudioFrameSize();
     default_sample_flags = set_sample_flags(1);
   } else if (mTrackType == Video_Track) {
     default_sample_description_index = 1;
     // Video meta data has assigned framerate, it implies that this video's
     // frame rate should be fixed.
-    if (mMeta.mVidMeta->FrameRate) {
+    if (mVideoMeta->GetVideoFrameRate()) {
       default_sample_duration =
-        mMeta.mVidMeta->VideoFrequency / mMeta.mVidMeta->FrameRate;
+        mVideoMeta->GetVideoClockRate() / mVideoMeta->GetVideoFrameRate();
     }
     default_sample_size = 0;
     default_sample_flags = set_sample_flags(0);
   } else {
     MOZ_ASSERT(0);
     return NS_ERROR_FAILURE;
   }
 
@@ -479,33 +481,31 @@ TrackExtendsBox::TrackExtendsBox(uint32_
   : FullBox(NS_LITERAL_CSTRING("trex"), 0, 0, aControl)
   , track_ID(0)
   , default_sample_description_index(0)
   , default_sample_duration(0)
   , default_sample_size(0)
   , default_sample_flags(0)
   , mTrackType(aType)
 {
-  mMeta.Init(aControl);
   MOZ_COUNT_CTOR(TrackExtendsBox);
 }
 
 TrackExtendsBox::~TrackExtendsBox()
 {
   MOZ_COUNT_DTOR(TrackExtendsBox);
 }
 
 MovieExtendsBox::MovieExtendsBox(ISOControl* aControl)
   : DefaultContainerImpl(NS_LITERAL_CSTRING("mvex"), aControl)
 {
-  mMeta.Init(aControl);
-  if (mMeta.mAudMeta) {
+  if (mAudioMeta) {
     boxes.AppendElement(new TrackExtendsBox(Audio_Track, aControl));
   }
-  if (mMeta.mVidMeta) {
+  if (mVideoMeta) {
     boxes.AppendElement(new TrackExtendsBox(Video_Track, aControl));
   }
   MOZ_COUNT_CTOR(MovieExtendsBox);
 }
 
 MovieExtendsBox::~MovieExtendsBox()
 {
   MOZ_COUNT_DTOR(MovieExtendsBox);
@@ -938,33 +938,32 @@ MediaHeaderBox::MediaHeaderBox(uint32_t 
   , duration(0)
   , pad(0)
   , lang1(0)
   , lang2(0)
   , lang3(0)
   , pre_defined(0)
 {
   mTrackType = aType;
-  mMeta.Init(aControl);
   MOZ_COUNT_CTOR(MediaHeaderBox);
 }
 
 MediaHeaderBox::~MediaHeaderBox()
 {
   MOZ_COUNT_DTOR(MediaHeaderBox);
 }
 
 uint32_t
 MediaHeaderBox::GetTimeScale()
 {
   if (mTrackType == Audio_Track) {
-    return mMeta.mAudMeta->SampleRate;
+    return mAudioMeta->GetAudioSampleRate();
   }
 
-  return mMeta.mVidMeta->VideoFrequency;
+  return mVideoMeta->GetVideoClockRate();
 }
 
 nsresult
 MediaHeaderBox::Generate(uint32_t* aBoxSize)
 {
   creation_time = mControl->GetTime();
   modification_time = mControl->GetTime();
   timescale = GetTimeScale();
@@ -1067,22 +1066,23 @@ MovieHeaderBox::Write()
   mControl->Write(next_track_ID);
 
   return NS_OK;
 }
 
 uint32_t
 MovieHeaderBox::GetTimeScale()
 {
-  if (mMeta.AudioOnly()) {
-    return mMeta.mAudMeta->SampleRate;
+  // Only audio track in container.
+  if (mAudioMeta && !mVideoMeta) {
+    return mAudioMeta->GetAudioSampleRate();
   }
 
   // return video rate
-  return mMeta.mVidMeta->VideoFrequency;
+  return mVideoMeta->GetVideoClockRate();
 }
 
 MovieHeaderBox::~MovieHeaderBox()
 {
   MOZ_COUNT_DTOR(MovieHeaderBox);
 }
 
 MovieHeaderBox::MovieHeaderBox(ISOControl* aControl)
@@ -1091,17 +1091,16 @@ MovieHeaderBox::MovieHeaderBox(ISOContro
   , modification_time(0)
   , timescale(90000)
   , duration(0)
   , rate(0x00010000)
   , volume(0x0100)
   , reserved16(0)
   , next_track_ID(1)
 {
-  mMeta.Init(aControl);
   memcpy(matrix, iso_matrix, sizeof(matrix));
   memset(reserved32, 0, sizeof(reserved32));
   memset(pre_defined, 0, sizeof(pre_defined));
   MOZ_COUNT_CTOR(MovieHeaderBox);
 }
 
 TrackHeaderBox::TrackHeaderBox(uint32_t aType, ISOControl* aControl)
   : FullBox(NS_LITERAL_CSTRING("tkhd"), 0,
@@ -1115,43 +1114,43 @@ TrackHeaderBox::TrackHeaderBox(uint32_t 
   , layer(0)
   , alternate_group(0)
   , volume(0)
   , reserved3(0)
   , width(0)
   , height(0)
 {
   mTrackType = aType;
-  mMeta.Init(aControl);
   memcpy(matrix, iso_matrix, sizeof(matrix));
   memset(reserved2, 0, sizeof(reserved2));
   MOZ_COUNT_CTOR(TrackHeaderBox);
 }
 
 TrackHeaderBox::~TrackHeaderBox()
 {
   MOZ_COUNT_DTOR(TrackHeaderBox);
 }
 
 nsresult
 TrackHeaderBox::Generate(uint32_t* aBoxSize)
 {
   creation_time = mControl->GetTime();
   modification_time = mControl->GetTime();
-  track_ID = (mTrackType == Audio_Track ? mControl->GetTrackID(Audio_Track)
-                                        : mControl->GetTrackID(Video_Track));
+  track_ID = (mTrackType == Audio_Track ?
+                mControl->GetTrackID(mAudioMeta->GetKind()) :
+                mControl->GetTrackID(mVideoMeta->GetKind()));
   // fragmented mp4
   duration = 0;
 
   // volume, audiotrack is always 0x0100 in 14496-12 8.3.2.2
   volume = (mTrackType == Audio_Track ? 0x0100 : 0);
 
   if (mTrackType == Video_Track) {
-    width = mMeta.mVidMeta->Width << 16;
-    height = mMeta.mVidMeta->Height << 16;
+    width = mVideoMeta->GetVideoWidth() << 16;
+    height = mVideoMeta->GetVideoHeight() << 16;
   }
 
   size += sizeof(creation_time) +
           sizeof(modification_time) +
           sizeof(track_ID) +
           sizeof(reserved) +
           sizeof(duration) +
           sizeof(reserved2) +
@@ -1314,24 +1313,16 @@ DefaultContainerImpl::Write()
 
 DefaultContainerImpl::DefaultContainerImpl(const nsACString& aType,
                                            ISOControl* aControl)
   : Box(aType, aControl)
 {
 }
 
 nsresult
-Box::MetaHelper::Init(ISOControl* aControl)
-{
-  aControl->GetAudioMetadata(mAudMeta);
-  aControl->GetVideoMetadata(mVidMeta);
-  return NS_OK;
-}
-
-nsresult
 Box::Write()
 {
   mControl->Write(size);
   mControl->WriteFourCC(boxType.get());
   return NS_OK;
 }
 
 nsresult
@@ -1343,16 +1334,18 @@ Box::Find(const nsACString& aType, nsTAr
   return NS_OK;
 }
 
 Box::Box(const nsACString& aType, ISOControl* aControl)
   : size(8), mControl(aControl)
 {
   MOZ_ASSERT(aType.Length() == 4);
   boxType = aType;
+  aControl->GetAudioMetadata(mAudioMeta);
+  aControl->GetVideoMetadata(mVideoMeta);
 }
 
 FullBox::FullBox(const nsACString& aType, uint8_t aVersion, uint32_t aFlags,
                  ISOControl* aControl)
   : Box(aType, aControl)
 {
   // Cast to uint64_t due to VC2010  bug.
   std::bitset<24> tmp_flags((uint64_t)aFlags);
@@ -1385,17 +1378,16 @@ TrackBox::~TrackBox()
 
 SampleEntryBox::SampleEntryBox(const nsACString& aFormat, ISOControl* aControl)
   : Box(aFormat, aControl)
   , data_reference_index(0)
 {
   data_reference_index = 1; // There is only one data reference in each track.
   size += sizeof(reserved) +
           sizeof(data_reference_index);
-  mMeta.Init(aControl);
   memset(reserved, 0, sizeof(reserved));
 }
 
 nsresult
 SampleEntryBox::Write()
 {
   Box::Write();
   mControl->Write(reserved, sizeof(reserved));
@@ -1421,20 +1413,19 @@ AudioSampleEntry::AudioSampleEntry(const
   : SampleEntryBox(aFormat, aControl)
   , sound_version(0)
   , channels(2)
   , sample_size(16)
   , compressionId(0)
   , packet_size(0)
   , timeScale(0)
 {
-  mMeta.Init(mControl);
   memset(reserved2, 0 , sizeof(reserved2));
-  channels = mMeta.mAudMeta->Channels;
-  timeScale = mMeta.mAudMeta->SampleRate << 16;
+  channels = mAudioMeta->GetAudioChannels();
+  timeScale = mAudioMeta->GetAudioSampleRate() << 16;
 
   size += sizeof(sound_version) +
           sizeof(reserved2) +
           sizeof(sample_size) +
           sizeof(channels) +
           sizeof(packet_size) +
           sizeof(compressionId) +
           sizeof(timeScale);
@@ -1476,18 +1467,18 @@ VisualSampleEntry::VisualSampleEntry(con
   , frame_count(1)
   , depth(video_depth)
   , pre_defined(-1)
 {
   memset(reserved, 0 , sizeof(reserved));
   memset(compressorName, 0 , sizeof(compressorName));
 
   // both fields occupy 16 bits defined in 14496-2 6.2.3.
-  width = mMeta.mVidMeta->Width;
-  height = mMeta.mVidMeta->Height;
+  width = mVideoMeta->GetVideoWidth();
+  height = mVideoMeta->GetVideoHeight();
 
   size += sizeof(reserved) +
           sizeof(width) +
           sizeof(height) +
           sizeof(horizresolution) +
           sizeof(vertresolution) +
           sizeof(reserved2) +
           sizeof(frame_count) +
--- a/content/media/encoder/fmp4_muxer/ISOMediaBoxes.h
+++ b/content/media/encoder/fmp4_muxer/ISOMediaBoxes.h
@@ -21,18 +21,18 @@
 namespace mozilla {
 
 /**
  * track type from spec 8.4.3.3
  */
 #define Audio_Track 0x01
 #define Video_Track 0x02
 
-class AACTrackMetadata;
-class AVCTrackMetadata;
+class AudioTrackMetadata;
+class VideoTrackMetadata;
 class ES_Descriptor;
 class ISOControl;
 
 /**
  * This is the base class for all ISO media format boxes.
  * It provides the fields of box type(four CC) and size.
  * The data members in the beginning of a Box (or its descendants)
  * are the 14496-12 defined member. Other members prefix with 'm'
@@ -48,32 +48,16 @@ protected:
                      // 14496-12 table 1.
 
 public:
   // MuxerOperation methods
   nsresult Write() MOZ_OVERRIDE;
   nsresult Find(const nsACString& aType,
                 nsTArray<nsRefPtr<MuxerOperation>>& aOperations) MOZ_OVERRIDE;
 
-  // A helper class to check box written bytes number; it will compare
-  // the size generated from Box::Generate() and the actually written length in
-  // Box::Write().
-  class MetaHelper {
-  public:
-    nsresult Init(ISOControl* aControl);
-    bool AudioOnly() {
-      if (mAudMeta && !mVidMeta) {
-        return true;
-      }
-      return false;
-    }
-    nsRefPtr<AACTrackMetadata> mAudMeta;
-    nsRefPtr<AVCTrackMetadata> mVidMeta;
-  };
-
   // This helper class will compare the written size in Write() and the size in
   // Generate(). If their are not equal, it will assert.
   class BoxSizeChecker {
   public:
     BoxSizeChecker(ISOControl* aControl, uint32_t aSize);
     ~BoxSizeChecker();
 
     uint32_t ori_size;
@@ -81,16 +65,18 @@ public:
     ISOControl* mControl;
   };
 
 protected:
   Box() MOZ_DELETE;
   Box(const nsACString& aType, ISOControl* aControl);
 
   ISOControl* mControl;
+  nsRefPtr<AudioTrackMetadata> mAudioMeta;
+  nsRefPtr<VideoTrackMetadata> mVideoMeta;
 };
 
 /**
  * FullBox (and its descendants) is the box which contains the 'real' data
  * members. It is the edge in the ISO box structure and it doesn't contain
  * any box.
  *
  * This class is for inherited only, it shouldn't be instanced directly.
@@ -184,19 +170,16 @@ public:
   // MuxerOperation methods
   nsresult Generate(uint32_t* aBoxSize) MOZ_OVERRIDE;
   nsresult Write() MOZ_OVERRIDE;
 
   // MovieHeaderBox methods
   MovieHeaderBox(ISOControl* aControl);
   ~MovieHeaderBox();
   uint32_t GetTimeScale();
-
-protected:
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.4.2 'Media Header Box'
 // Box type: 'mdhd'
 class MediaHeaderBox : public FullBox {
 public:
   // ISO BMFF members
   uint32_t creation_time;
@@ -215,17 +198,16 @@ public:
 
   // MediaHeaderBox methods
   MediaHeaderBox(uint32_t aType, ISOControl* aControl);
   ~MediaHeaderBox();
   uint32_t GetTimeScale();
 
 protected:
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.3.1 'Track Box'
 // Box type: 'trak'
 // TrackBox contains TrackHeaderBox and MediaBox.
 class TrackBox : public DefaultContainerImpl {
 public:
   TrackBox(uint32_t aTrackType, ISOControl* aControl);
@@ -294,17 +276,16 @@ public:
   TrackRunBox(uint32_t aType, uint32_t aFlags, ISOControl* aControl);
   ~TrackRunBox();
 
 protected:
   uint32_t fillSampleTable();
 
   uint32_t mAllSampleSize;
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // tf_flags in TrackFragmentHeaderBox, 14496-12 8.8.7.1.
 #define base_data_offset_present         0x000001
 #define sample_description_index_present 0x000002
 #define default_sample_duration_present  0x000008
 #define default_sample_size_present      0x000010
 #define default_sample_flags_present     0x000020
@@ -328,17 +309,16 @@ public:
   nsresult UpdateBaseDataOffset(uint64_t aOffset); // The offset of the first
                                                    // sample in file.
 
   TrackFragmentHeaderBox(uint32_t aType, uint32_t aFlags, ISOControl* aControl);
   ~TrackFragmentHeaderBox();
 
 protected:
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.8.6 'Track Fragment Box'
 // Box type: 'traf'
 // TrackFragmentBox cotains TrackFragmentHeaderBox and TrackRunBox.
 class TrackFragmentBox : public DefaultContainerImpl {
 public:
   TrackFragmentBox(uint32_t aType, ISOControl* aControl);
@@ -399,29 +379,25 @@ public:
   nsresult Write() MOZ_OVERRIDE;
 
   // TrackExtendsBox methods
   TrackExtendsBox(uint32_t aType, ISOControl* aControl);
   ~TrackExtendsBox();
 
 protected:
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.8.1 'Movie Extends Box'
 // Box type: 'mvex'
 // MovieExtendsBox contains TrackExtendsBox.
 class MovieExtendsBox : public DefaultContainerImpl {
 public:
   MovieExtendsBox(ISOControl* aControl);
   ~MovieExtendsBox();
-
-protected:
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.7.5 'Chunk Offset Box'
 // Box type: 'stco'
 class ChunkOffsetBox : public FullBox {
 public:
   // ISO BMFF members
   typedef struct {
@@ -522,18 +498,16 @@ public:
   // sampleentrybox methods
   SampleEntryBox(const nsACString& aFormat, ISOControl* aControl);
 
   // MuxerOperation methods
   nsresult Write() MOZ_OVERRIDE;
 
 protected:
   SampleEntryBox() MOZ_DELETE;
-
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.5.2 'Sample Description Box'
 // Box type: 'stsd'
 class SampleDescriptionBox : public FullBox {
 public:
   // ISO BMFF members
   uint32_t entry_count;
@@ -759,17 +733,16 @@ public:
   nsresult Write() MOZ_OVERRIDE;
 
   // TrackHeaderBox methods
   TrackHeaderBox(uint32_t aType, ISOControl* aControl);
   ~TrackHeaderBox();
 
 protected:
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.4.3 'Handler Reference Box'
 // Box type: 'hdlr'
 class HandlerBox : public FullBox {
 public:
   // ISO BMFF members
   uint32_t pre_defined;
--- a/content/media/encoder/fmp4_muxer/ISOMediaWriter.cpp
+++ b/content/media/encoder/fmp4_muxer/ISOMediaWriter.cpp
@@ -201,27 +201,23 @@ ISOMediaWriter::GetContainerData(nsTArra
   return NS_OK;
 }
 
 nsresult
 ISOMediaWriter::SetMetadata(TrackMetadataBase* aMetadata)
 {
   if (aMetadata->GetKind() == TrackMetadataBase::METADATA_AAC ) {
     mControl->SetMetadata(aMetadata);
-    mAudioFragmentBuffer = new FragmentBuffer(Audio_Track,
-                                              FRAG_DURATION,
-                                              aMetadata);
+    mAudioFragmentBuffer = new FragmentBuffer(Audio_Track, FRAG_DURATION);
     mControl->SetFragment(mAudioFragmentBuffer);
     return NS_OK;
   }
   if (aMetadata->GetKind() == TrackMetadataBase::METADATA_AVC) {
     mControl->SetMetadata(aMetadata);
-    mVideoFragmentBuffer = new FragmentBuffer(Video_Track,
-                                              FRAG_DURATION,
-                                              aMetadata);
+    mVideoFragmentBuffer = new FragmentBuffer(Video_Track, FRAG_DURATION);
     mControl->SetFragment(mVideoFragmentBuffer);
     return NS_OK;
   }
 
   return NS_ERROR_FAILURE;
 }
 
 }  // namespace mozilla
--- a/content/media/encoder/fmp4_muxer/ISOMediaWriter.h
+++ b/content/media/encoder/fmp4_muxer/ISOMediaWriter.h
@@ -8,18 +8,16 @@
 
 #include "ContainerWriter.h"
 #include "nsIRunnable.h"
 
 namespace mozilla {
 
 class ISOControl;
 class FragmentBuffer;
-class AACTrackMetadata;
-class AVCTrackMetadata;
 class ISOMediaWriterRunnable;
 
 class ISOMediaWriter : public ContainerWriter
 {
 public:
   // Generate an fragmented MP4 stream, ISO/IEC 14496-12.
   // Brand names in 'ftyp' box are 'isom' and 'mp42'.
   const static uint32_t TYPE_FRAG_MP4 = 1 << 0;
--- a/content/media/encoder/fmp4_muxer/ISOTrackMetadata.h
+++ b/content/media/encoder/fmp4_muxer/ISOTrackMetadata.h
@@ -5,47 +5,66 @@
 
 #ifndef ISOTrackMetadata_h_
 #define ISOTrackMetadata_h_
 
 #include "TrackMetadataBase.h"
 
 namespace mozilla {
 
-class AACTrackMetadata : public TrackMetadataBase {
+class AACTrackMetadata : public AudioTrackMetadata {
 public:
-  uint32_t SampleRate;     // From 14496-3 table 1.16, it could be 7350 ~ 96000.
-  uint32_t FrameDuration;  // Audio frame duration based on SampleRate.
-  uint32_t FrameSize;      // Audio frame size, 0 is variant size.
-  uint32_t Channels;       // Channel number, it should be 1 or 2.
+  // AudioTrackMetadata members
+  uint32_t GetAudioFrameDuration() MOZ_OVERRIDE { return mFrameDuration; }
+  uint32_t GetAudioFrameSize() MOZ_OVERRIDE { return mFrameSize; }
+  uint32_t GetAudioSampleRate() MOZ_OVERRIDE { return mSampleRate; }
+  uint32_t GetAudioChannels() MOZ_OVERRIDE { return mChannels; }
 
+  // TrackMetadataBase member
+  MetadataKind GetKind() const MOZ_OVERRIDE { return METADATA_AAC; }
+
+  // AACTrackMetadata members
   AACTrackMetadata()
-    : SampleRate(0)
-    , FrameDuration(0)
-    , FrameSize(0)
-    , Channels(0) {
+    : mSampleRate(0)
+    , mFrameDuration(0)
+    , mFrameSize(0)
+    , mChannels(0) {
     MOZ_COUNT_CTOR(AACTrackMetadata);
   }
   ~AACTrackMetadata() { MOZ_COUNT_DTOR(AACTrackMetadata); }
-  MetadataKind GetKind() const MOZ_OVERRIDE { return METADATA_AAC; }
+
+  uint32_t mSampleRate;     // From 14496-3 table 1.16, it could be 7350 ~ 96000.
+  uint32_t mFrameDuration;  // Audio frame duration based on SampleRate.
+  uint32_t mFrameSize;      // Audio frame size, 0 is variant size.
+  uint32_t mChannels;       // Channel number, it should be 1 or 2.
 };
 
-class AVCTrackMetadata : public TrackMetadataBase {
+// AVC clock rate is 90k Hz.
+#define AVC_CLOCK_RATE 90000
+
+class AVCTrackMetadata : public VideoTrackMetadata {
 public:
-  uint32_t Height;
-  uint32_t Width;
-  uint32_t VideoFrequency;  // for AVC, it should be 90k Hz.
-  uint32_t FrameRate;       // frames per second
+  // VideoTrackMetadata members
+  uint32_t GetVideoHeight() MOZ_OVERRIDE { return mHeight; }
+  uint32_t GetVideoWidth() MOZ_OVERRIDE {return mWidth; }
+  uint32_t GetVideoClockRate() MOZ_OVERRIDE { return AVC_CLOCK_RATE; }
+  uint32_t GetVideoFrameRate() MOZ_OVERRIDE { return mFrameRate; }
 
+  // TrackMetadataBase member
+  MetadataKind GetKind() const MOZ_OVERRIDE { return METADATA_AVC; }
+
+  // AVCTrackMetadata
   AVCTrackMetadata()
-    : Height(0)
-    , Width(0)
-    , VideoFrequency(0)
-    , FrameRate(0) {
+    : mHeight(0)
+    , mWidth(0)
+    , mFrameRate(0) {
     MOZ_COUNT_CTOR(AVCTrackMetadata);
   }
   ~AVCTrackMetadata() { MOZ_COUNT_DTOR(AVCTrackMetadata); }
-  MetadataKind GetKind() const MOZ_OVERRIDE { return METADATA_AVC; }
+
+  uint32_t mHeight;
+  uint32_t mWidth;
+  uint32_t mFrameRate;       // frames per second
 };
 
 }
 
 #endif // ISOTrackMetadata_h_
--- a/content/media/encoder/fmp4_muxer/MP4ESDS.cpp
+++ b/content/media/encoder/fmp4_muxer/MP4ESDS.cpp
@@ -101,18 +101,16 @@ ES_Descriptor::Write()
 }
 
 nsresult
 ES_Descriptor::Generate(uint32_t* aBoxSize)
 {
   nsresult rv;
   //   14496-1 '8.3.4 DecoderConfigDescriptor'
   //   14496-1 '10.2.3 SL Packet Header Configuration'
-  Box::MetaHelper meta;
-  meta.Init(mControl);
   FragmentBuffer* frag = mControl->GetFragment(Audio_Track);
   rv = frag->GetCSD(DecodeSpecificInfo);
   NS_ENSURE_SUCCESS(rv, rv);
 
   length = sizeof(ES_ID) + 1;
   length += DecodeSpecificInfo.Length();
 
   *aBoxSize = sizeof(tag) + sizeof(length) + length;