Bug 971641 - Add AudioTrackMetadata and VideoTrackMetadata abstract class. r=cpearce
author: Alfredo Yang <ayang@mozilla.com>
Tue, 18 Mar 2014 08:20:18 -0400
changeset 174041 a6abe0d4e0be64d787308215604efa8ffb51ddb6
parent 174040 cd104d1d2f8bc5399526528a085d220775798fb6
child 174042 b7f5aaff655899c7f4cd041dd4eae936b9c15519
push id: 5536
push user: ryanvm@gmail.com
push date: Tue, 18 Mar 2014 12:19:45 +0000
treeherder: b2g-inbound@3ae395a54630 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: cpearce
bugs: 971641
milestone: 31.0a1
Bug 971641 - Add AudioTrackMetadata and VideoTrackMetadata abstract class. r=cpearce
content/media/encoder/EncodedFrameContainer.h
content/media/encoder/OmxTrackEncoder.cpp
content/media/encoder/TrackMetadataBase.h
content/media/encoder/fmp4_muxer/ISOControl.cpp
content/media/encoder/fmp4_muxer/ISOControl.h
content/media/encoder/fmp4_muxer/ISOMediaBoxes.cpp
content/media/encoder/fmp4_muxer/ISOMediaBoxes.h
content/media/encoder/fmp4_muxer/ISOMediaWriter.cpp
content/media/encoder/fmp4_muxer/ISOMediaWriter.h
content/media/encoder/fmp4_muxer/ISOTrackMetadata.h
content/media/encoder/fmp4_muxer/MP4ESDS.cpp
--- a/content/media/encoder/EncodedFrameContainer.h
+++ b/content/media/encoder/EncodedFrameContainer.h
@@ -53,16 +53,18 @@ public:
     OPUS_AUDIO_FRAME, // Opus audio frame
     VORBIS_AUDIO_FRAME,
     AVC_I_FRAME,
     AVC_P_FRAME,
     AVC_B_FRAME,
     AVC_CSD,          // AVC codec specific data
     AAC_AUDIO_FRAME,
     AAC_CSD,          // AAC codec specific data
+    AMR_AUDIO_CSD,
+    AMR_AUDIO_FRAME,
     UNKNOWN           // FrameType not set
   };
   nsresult SwapInFrameData(nsTArray<uint8_t>& aData)
   {
     mFrameData.SwapElements(aData);
     return NS_OK;
   }
   nsresult SwapOutFrameData(nsTArray<uint8_t>& aData)
--- a/content/media/encoder/OmxTrackEncoder.cpp
+++ b/content/media/encoder/OmxTrackEncoder.cpp
@@ -60,20 +60,19 @@ OmxVideoTrackEncoder::GetMetadata()
     }
   }
 
   if (mCanceled || mEncodingComplete) {
     return nullptr;
   }
 
   nsRefPtr<AVCTrackMetadata> meta = new AVCTrackMetadata();
-  meta->Width = mFrameWidth;
-  meta->Height = mFrameHeight;
-  meta->FrameRate = ENCODER_CONFIG_FRAME_RATE;
-  meta->VideoFrequency = 90000; // Hz
+  meta->mWidth = mFrameWidth;
+  meta->mHeight = mFrameHeight;
+  meta->mFrameRate = ENCODER_CONFIG_FRAME_RATE;
   return meta.forget();
 }
 
 nsresult
 OmxVideoTrackEncoder::GetEncodedTrack(EncodedFrameContainer& aData)
 {
   VideoSegment segment;
   {
@@ -186,20 +185,20 @@ OmxAudioTrackEncoder::GetMetadata()
     }
   }
 
   if (mCanceled || mEncodingComplete) {
     return nullptr;
   }
 
   nsRefPtr<AACTrackMetadata> meta = new AACTrackMetadata();
-  meta->Channels = mChannels;
-  meta->SampleRate = mSamplingRate;
-  meta->FrameSize = OMXCodecWrapper::kAACFrameSize;
-  meta->FrameDuration = OMXCodecWrapper::kAACFrameDuration;
+  meta->mChannels = mChannels;
+  meta->mSampleRate = mSamplingRate;
+  meta->mFrameSize = OMXCodecWrapper::kAACFrameSize;
+  meta->mFrameDuration = OMXCodecWrapper::kAACFrameDuration;
 
   return meta.forget();
 }
 
 nsresult
 OmxAudioTrackEncoder::AppendEncodedFrames(EncodedFrameContainer& aContainer)
 {
   nsTArray<uint8_t> frameData;
--- a/content/media/encoder/TrackMetadataBase.h
+++ b/content/media/encoder/TrackMetadataBase.h
@@ -23,10 +23,38 @@ public:
     METADATA_AAC,
     METADATA_UNKNOWN  // Metadata Kind not set
   };
   virtual ~TrackMetadataBase() {}
   // Return the specific metadata kind
   virtual MetadataKind GetKind() const = 0;
 };
 
+// The base class for audio metadata.
+class AudioTrackMetadata : public TrackMetadataBase {
+public:
+  // The duration of each sample set generated by the encoder, counted in
+  // samples. If the duration is variable, this method should return 0.
+  virtual uint32_t GetAudioFrameDuration() = 0;
+  // The size of each sample set generated by the encoder, counted in bytes.
+  // If the size is variable, this method should return 0.
+  virtual uint32_t GetAudioFrameSize() = 0;
+  // AudioSampleRate is the number of audio samples per second.
+  virtual uint32_t GetAudioSampleRate() = 0;
+  virtual uint32_t GetAudioChannels() = 0;
+};
+
+// The base class for video metadata.
+class VideoTrackMetadata : public TrackMetadataBase {
+public:
+  virtual uint32_t GetVideoHeight() = 0;
+  virtual uint32_t GetVideoWidth() = 0;
+  // VideoClockRate is the number of clock ticks per second used by the video
+  // frame's timestamp.
+  // For example, if VideoClockRate is 90k Hz and VideoFrameRate is
+  // 30 fps, each frame's sample duration will be 3000 clock ticks.
+  virtual uint32_t GetVideoClockRate() = 0;
+  // VideoFrameRate is the number of frames per second.
+  virtual uint32_t GetVideoFrameRate() = 0;
+};
+
 }
 #endif
--- a/content/media/encoder/fmp4_muxer/ISOControl.cpp
+++ b/content/media/encoder/fmp4_muxer/ISOControl.cpp
@@ -10,33 +10,25 @@
 #include "EncodedFrameContainer.h"
 
 namespace mozilla {
 
 // For MP4 creation_time and modification_time offset from January 1, 1904 to
 // January 1, 1970.
 #define iso_time_offset 2082844800
 
-FragmentBuffer::FragmentBuffer(uint32_t aTrackType, uint32_t aFragDuration,
-                               TrackMetadataBase* aMetadata)
+FragmentBuffer::FragmentBuffer(uint32_t aTrackType, uint32_t aFragDuration)
   : mTrackType(aTrackType)
   , mFragDuration(aFragDuration)
   , mMediaStartTime(0)
   , mFragmentNumber(0)
   , mLastFrameTimeOfLastFragment(0)
   , mEOS(false)
 {
   mFragArray.AppendElement();
-  if (mTrackType == Audio_Track) {
-    nsRefPtr<AACTrackMetadata> audMeta = static_cast<AACTrackMetadata*>(aMetadata);
-    MOZ_ASSERT(audMeta);
-  } else {
-    nsRefPtr<AVCTrackMetadata> vidMeta = static_cast<AVCTrackMetadata*>(aMetadata);
-    MOZ_ASSERT(vidMeta);
-  }
   MOZ_COUNT_CTOR(FragmentBuffer);
 }
 
 FragmentBuffer::~FragmentBuffer()
 {
   MOZ_COUNT_DTOR(FragmentBuffer);
 }
 
@@ -152,82 +144,76 @@ ISOControl::~ISOControl()
 
 uint32_t
 ISOControl::GetNextTrackID()
 {
   return (mMetaArray.Length() + 1);
 }
 
 uint32_t
-ISOControl::GetTrackID(uint32_t aTrackType)
+ISOControl::GetTrackID(TrackMetadataBase::MetadataKind aKind)
 {
-  TrackMetadataBase::MetadataKind kind;
-  if (aTrackType == Audio_Track) {
-    kind = TrackMetadataBase::METADATA_AAC;
-  } else {
-    kind = TrackMetadataBase::METADATA_AVC;
-  }
-
   for (uint32_t i = 0; i < mMetaArray.Length(); i++) {
-    if (mMetaArray[i]->GetKind() == kind) {
+    if (mMetaArray[i]->GetKind() == aKind) {
       return (i + 1);
     }
   }
 
+  // Valid track IDs start at 1; reaching here means no track of aKind exists.
+  MOZ_ASSERT(0);
   return 0;
 }
 
 nsresult
 ISOControl::SetMetadata(TrackMetadataBase* aTrackMeta)
 {
   if (aTrackMeta->GetKind() == TrackMetadataBase::METADATA_AAC ||
       aTrackMeta->GetKind() == TrackMetadataBase::METADATA_AVC) {
     mMetaArray.AppendElement(aTrackMeta);
     return NS_OK;
   }
   return NS_ERROR_FAILURE;
 }
 
 nsresult
-ISOControl::GetAudioMetadata(nsRefPtr<AACTrackMetadata>& aAudMeta)
+ISOControl::GetAudioMetadata(nsRefPtr<AudioTrackMetadata>& aAudMeta)
 {
   for (uint32_t i = 0; i < mMetaArray.Length() ; i++) {
     if (mMetaArray[i]->GetKind() == TrackMetadataBase::METADATA_AAC) {
-      aAudMeta = static_cast<AACTrackMetadata*>(mMetaArray[i].get());
+      aAudMeta = static_cast<AudioTrackMetadata*>(mMetaArray[i].get());
       return NS_OK;
     }
   }
   return NS_ERROR_FAILURE;
 }
 
 nsresult
-ISOControl::GetVideoMetadata(nsRefPtr<AVCTrackMetadata>& aVidMeta)
+ISOControl::GetVideoMetadata(nsRefPtr<VideoTrackMetadata>& aVidMeta)
 {
   for (uint32_t i = 0; i < mMetaArray.Length() ; i++) {
     if (mMetaArray[i]->GetKind() == TrackMetadataBase::METADATA_AVC) {
-      aVidMeta = static_cast<AVCTrackMetadata*>(mMetaArray[i].get());
+      aVidMeta = static_cast<VideoTrackMetadata*>(mMetaArray[i].get());
       return NS_OK;
     }
   }
-
   return NS_ERROR_FAILURE;
 }
 
 bool
 ISOControl::HasAudioTrack()
 {
-  nsRefPtr<AACTrackMetadata> audMeta;
+  nsRefPtr<AudioTrackMetadata> audMeta;
   GetAudioMetadata(audMeta);
   return audMeta;
 }
 
 bool
 ISOControl::HasVideoTrack()
 {
-  nsRefPtr<AVCTrackMetadata> vidMeta;
+  nsRefPtr<VideoTrackMetadata> vidMeta;
   GetVideoMetadata(vidMeta);
   return vidMeta;
 }
 
 nsresult
 ISOControl::SetFragment(FragmentBuffer* aFragment)
 {
   if (aFragment->GetType() == Audio_Track) {
--- a/content/media/encoder/fmp4_muxer/ISOControl.h
+++ b/content/media/encoder/fmp4_muxer/ISOControl.h
@@ -24,18 +24,17 @@ class ISOControl;
  * life cycle, when a fragment is formed in ISOControl, Flush() needs to
  * be called to reset it.
  */
 class FragmentBuffer {
 public:
   // aTrackType: it could be Audio_Track or Video_Track.
   // aFragDuration: it is the fragment duration. (microsecond per unit)
   //                Audio and video have the same fragment duration.
-  FragmentBuffer(uint32_t aTrackType, uint32_t aFragDuration,
-                 TrackMetadataBase* aMetadata);
+  FragmentBuffer(uint32_t aTrackType, uint32_t aFragDuration);
   ~FragmentBuffer();
 
   // Get samples of first fragment, that will swap all the elements in the
   // mFragArray[0] when aFlush = true, and caller is responsible for drop
   // EncodedFrame reference count.
   nsresult GetFirstFragment(nsTArray<nsRefPtr<EncodedFrame>>& aFragment,
                             bool aFlush = false);
 
@@ -186,21 +185,23 @@ public:
   uint32_t GetCurFragmentNumber() { return mFragNum; }
 
   nsresult SetFragment(FragmentBuffer* aFragment);
   FragmentBuffer* GetFragment(uint32_t aType);
 
   uint32_t GetMuxingType() { return mMuxingType; }
 
   nsresult SetMetadata(TrackMetadataBase* aTrackMeta);
-  nsresult GetAudioMetadata(nsRefPtr<AACTrackMetadata>& aAudMeta);
-  nsresult GetVideoMetadata(nsRefPtr<AVCTrackMetadata>& aVidMeta);
+  nsresult GetAudioMetadata(nsRefPtr<AudioTrackMetadata>& aAudMeta);
+  nsresult GetVideoMetadata(nsRefPtr<VideoTrackMetadata>& aVidMeta);
 
-  // Track ID is the Metadata index in mMetaArray.
-  uint32_t GetTrackID(uint32_t aTrackType);
+  // Track ID is the Metadata index in mMetaArray plus 1. This muxer allows
+  // only 1 audio track and 1 video track; it is prohibited to have multiple
+  // audio tracks or multiple video tracks in the same file.
+  uint32_t GetTrackID(TrackMetadataBase::MetadataKind aKind);
   uint32_t GetNextTrackID();
 
   bool HasAudioTrack();
   bool HasVideoTrack();
 
 private:
   uint32_t GetBufPos();
   nsresult FlushBuf();
--- a/content/media/encoder/fmp4_muxer/ISOMediaBoxes.cpp
+++ b/content/media/encoder/fmp4_muxer/ISOMediaBoxes.cpp
@@ -159,17 +159,17 @@ TrackRunBox::fillSampleTable()
                      frames.ElementAt(i - 1)->GetTimeStamp();
         // Keep the last frame time of current fagment, it will be used to calculate
         // the first frame duration of next fragment.
         if ((len - 1) == i) {
           frag->SetLastFragmentLastFrameTime(frames.ElementAt(i)->GetTimeStamp());
         }
       }
       sample_info_table[i].sample_duration =
-        frame_time * mMeta.mVidMeta->VideoFrequency / USECS_PER_S;
+        frame_time * mVideoMeta->GetVideoClockRate() / USECS_PER_S;
       table_size += sizeof(uint32_t);
     }
 
     sample_info_table[i].sample_composition_time_offset = 0;
   }
   return table_size;
 }
 
@@ -226,17 +226,16 @@ TrackRunBox::Write()
 TrackRunBox::TrackRunBox(uint32_t aType, uint32_t aFlags, ISOControl* aControl)
   : FullBox(NS_LITERAL_CSTRING("trun"), 0, aFlags, aControl)
   , sample_count(0)
   , data_offset(0)
   , first_sample_flags(0)
   , mAllSampleSize(0)
   , mTrackType(aType)
 {
-  mMeta.Init(aControl);
   MOZ_COUNT_CTOR(TrackRunBox);
 }
 
 TrackRunBox::~TrackRunBox()
 {
   MOZ_COUNT_DTOR(TrackRunBox);
 }
 
@@ -245,37 +244,39 @@ TrackFragmentHeaderBox::UpdateBaseDataOf
 {
   base_data_offset = aOffset;
   return NS_OK;
 }
 
 nsresult
 TrackFragmentHeaderBox::Generate(uint32_t* aBoxSize)
 {
-  track_ID = mControl->GetTrackID(mTrackType);
+  track_ID = (mTrackType == Audio_Track ?
+                mControl->GetTrackID(mAudioMeta->GetKind()) :
+                mControl->GetTrackID(mVideoMeta->GetKind()));
   size += sizeof(track_ID);
 
   if (flags.to_ulong() & base_data_offset_present) {
     // base_data_offset needs to add size of 'trun', 'tfhd' and
     // header of 'mdat' later.
     base_data_offset = 0;
     size += sizeof(base_data_offset);
   }
   if (flags.to_ulong() & default_sample_duration_present) {
     if (mTrackType == Video_Track) {
-      if (!mMeta.mVidMeta->FrameRate) {
+      if (!mVideoMeta->GetVideoFrameRate()) {
         // 0 means frame rate is variant, so it is wrong to write
         // default_sample_duration.
         MOZ_ASSERT(0);
         default_sample_duration = 0;
       } else {
-        default_sample_duration = mMeta.mVidMeta->VideoFrequency / mMeta.mVidMeta->FrameRate;
+        default_sample_duration = mVideoMeta->GetVideoClockRate() / mVideoMeta->GetVideoFrameRate();
       }
     } else if (mTrackType == Audio_Track) {
-      default_sample_duration = mMeta.mAudMeta->FrameDuration;
+      default_sample_duration = mAudioMeta->GetAudioFrameDuration();
     } else {
       MOZ_ASSERT(0);
       return NS_ERROR_FAILURE;
     }
     size += sizeof(default_sample_duration);
   }
   *aBoxSize = size;
   return NS_OK;
@@ -299,17 +300,16 @@ TrackFragmentHeaderBox::TrackFragmentHea
                                                uint32_t aFlags,
                                                ISOControl* aControl)
   : FullBox(NS_LITERAL_CSTRING("tfhd"), 0, aFlags, aControl)
   , track_ID(0)
   , base_data_offset(0)
   , default_sample_duration(0)
 {
   mTrackType = aType;
-  mMeta.Init(mControl);
   MOZ_COUNT_CTOR(TrackFragmentHeaderBox);
 }
 
 TrackFragmentHeaderBox::~TrackFragmentHeaderBox()
 {
   MOZ_COUNT_DTOR(TrackFragmentHeaderBox);
 }
 
@@ -424,30 +424,32 @@ MovieFragmentBox::Generate(uint32_t* aBo
   }
 
   return NS_OK;
 }
 
 nsresult
 TrackExtendsBox::Generate(uint32_t* aBoxSize)
 {
-  track_ID = mControl->GetTrackID(mTrackType);
+  track_ID = (mTrackType == Audio_Track ?
+                mControl->GetTrackID(mAudioMeta->GetKind()) :
+                mControl->GetTrackID(mVideoMeta->GetKind()));
 
   if (mTrackType == Audio_Track) {
     default_sample_description_index = 1;
-    default_sample_duration = mMeta.mAudMeta->FrameDuration;
-    default_sample_size = mMeta.mAudMeta->FrameSize;
+    default_sample_duration = mAudioMeta->GetAudioFrameDuration();
+    default_sample_size = mAudioMeta->GetAudioFrameSize();
     default_sample_flags = set_sample_flags(1);
   } else if (mTrackType == Video_Track) {
     default_sample_description_index = 1;
     // Video meta data has assigned framerate, it implies that this video's
     // frame rate should be fixed.
-    if (mMeta.mVidMeta->FrameRate) {
+    if (mVideoMeta->GetVideoFrameRate()) {
       default_sample_duration =
-        mMeta.mVidMeta->VideoFrequency / mMeta.mVidMeta->FrameRate;
+        mVideoMeta->GetVideoClockRate() / mVideoMeta->GetVideoFrameRate();
     }
     default_sample_size = 0;
     default_sample_flags = set_sample_flags(0);
   } else {
     MOZ_ASSERT(0);
     return NS_ERROR_FAILURE;
   }
 
@@ -479,33 +481,31 @@ TrackExtendsBox::TrackExtendsBox(uint32_
   : FullBox(NS_LITERAL_CSTRING("trex"), 0, 0, aControl)
   , track_ID(0)
   , default_sample_description_index(0)
   , default_sample_duration(0)
   , default_sample_size(0)
   , default_sample_flags(0)
   , mTrackType(aType)
 {
-  mMeta.Init(aControl);
   MOZ_COUNT_CTOR(TrackExtendsBox);
 }
 
 TrackExtendsBox::~TrackExtendsBox()
 {
   MOZ_COUNT_DTOR(TrackExtendsBox);
 }
 
 MovieExtendsBox::MovieExtendsBox(ISOControl* aControl)
   : DefaultContainerImpl(NS_LITERAL_CSTRING("mvex"), aControl)
 {
-  mMeta.Init(aControl);
-  if (mMeta.mAudMeta) {
+  if (mAudioMeta) {
     boxes.AppendElement(new TrackExtendsBox(Audio_Track, aControl));
   }
-  if (mMeta.mVidMeta) {
+  if (mVideoMeta) {
     boxes.AppendElement(new TrackExtendsBox(Video_Track, aControl));
   }
   MOZ_COUNT_CTOR(MovieExtendsBox);
 }
 
 MovieExtendsBox::~MovieExtendsBox()
 {
   MOZ_COUNT_DTOR(MovieExtendsBox);
@@ -938,33 +938,32 @@ MediaHeaderBox::MediaHeaderBox(uint32_t 
   , duration(0)
   , pad(0)
   , lang1(0)
   , lang2(0)
   , lang3(0)
   , pre_defined(0)
 {
   mTrackType = aType;
-  mMeta.Init(aControl);
   MOZ_COUNT_CTOR(MediaHeaderBox);
 }
 
 MediaHeaderBox::~MediaHeaderBox()
 {
   MOZ_COUNT_DTOR(MediaHeaderBox);
 }
 
 uint32_t
 MediaHeaderBox::GetTimeScale()
 {
   if (mTrackType == Audio_Track) {
-    return mMeta.mAudMeta->SampleRate;
+    return mAudioMeta->GetAudioSampleRate();
   }
 
-  return mMeta.mVidMeta->VideoFrequency;
+  return mVideoMeta->GetVideoClockRate();
 }
 
 nsresult
 MediaHeaderBox::Generate(uint32_t* aBoxSize)
 {
   creation_time = mControl->GetTime();
   modification_time = mControl->GetTime();
   timescale = GetTimeScale();
@@ -1067,22 +1066,23 @@ MovieHeaderBox::Write()
   mControl->Write(next_track_ID);
 
   return NS_OK;
 }
 
 uint32_t
 MovieHeaderBox::GetTimeScale()
 {
-  if (mMeta.AudioOnly()) {
-    return mMeta.mAudMeta->SampleRate;
+  // Only audio track in container.
+  if (mAudioMeta && !mVideoMeta) {
+    return mAudioMeta->GetAudioSampleRate();
   }
 
   // return video rate
-  return mMeta.mVidMeta->VideoFrequency;
+  return mVideoMeta->GetVideoClockRate();
 }
 
 MovieHeaderBox::~MovieHeaderBox()
 {
   MOZ_COUNT_DTOR(MovieHeaderBox);
 }
 
 MovieHeaderBox::MovieHeaderBox(ISOControl* aControl)
@@ -1091,17 +1091,16 @@ MovieHeaderBox::MovieHeaderBox(ISOContro
   , modification_time(0)
   , timescale(90000)
   , duration(0)
   , rate(0x00010000)
   , volume(0x0100)
   , reserved16(0)
   , next_track_ID(1)
 {
-  mMeta.Init(aControl);
   memcpy(matrix, iso_matrix, sizeof(matrix));
   memset(reserved32, 0, sizeof(reserved32));
   memset(pre_defined, 0, sizeof(pre_defined));
   MOZ_COUNT_CTOR(MovieHeaderBox);
 }
 
 TrackHeaderBox::TrackHeaderBox(uint32_t aType, ISOControl* aControl)
   : FullBox(NS_LITERAL_CSTRING("tkhd"), 0,
@@ -1115,43 +1114,43 @@ TrackHeaderBox::TrackHeaderBox(uint32_t 
   , layer(0)
   , alternate_group(0)
   , volume(0)
   , reserved3(0)
   , width(0)
   , height(0)
 {
   mTrackType = aType;
-  mMeta.Init(aControl);
   memcpy(matrix, iso_matrix, sizeof(matrix));
   memset(reserved2, 0, sizeof(reserved2));
   MOZ_COUNT_CTOR(TrackHeaderBox);
 }
 
 TrackHeaderBox::~TrackHeaderBox()
 {
   MOZ_COUNT_DTOR(TrackHeaderBox);
 }
 
 nsresult
 TrackHeaderBox::Generate(uint32_t* aBoxSize)
 {
   creation_time = mControl->GetTime();
   modification_time = mControl->GetTime();
-  track_ID = (mTrackType == Audio_Track ? mControl->GetTrackID(Audio_Track)
-                                        : mControl->GetTrackID(Video_Track));
+  track_ID = (mTrackType == Audio_Track ?
+                mControl->GetTrackID(mAudioMeta->GetKind()) :
+                mControl->GetTrackID(mVideoMeta->GetKind()));
   // fragmented mp4
   duration = 0;
 
   // volume, audiotrack is always 0x0100 in 14496-12 8.3.2.2
   volume = (mTrackType == Audio_Track ? 0x0100 : 0);
 
   if (mTrackType == Video_Track) {
-    width = mMeta.mVidMeta->Width << 16;
-    height = mMeta.mVidMeta->Height << 16;
+    width = mVideoMeta->GetVideoWidth() << 16;
+    height = mVideoMeta->GetVideoHeight() << 16;
   }
 
   size += sizeof(creation_time) +
           sizeof(modification_time) +
           sizeof(track_ID) +
           sizeof(reserved) +
           sizeof(duration) +
           sizeof(reserved2) +
@@ -1314,24 +1313,16 @@ DefaultContainerImpl::Write()
 
 DefaultContainerImpl::DefaultContainerImpl(const nsACString& aType,
                                            ISOControl* aControl)
   : Box(aType, aControl)
 {
 }
 
 nsresult
-Box::MetaHelper::Init(ISOControl* aControl)
-{
-  aControl->GetAudioMetadata(mAudMeta);
-  aControl->GetVideoMetadata(mVidMeta);
-  return NS_OK;
-}
-
-nsresult
 Box::Write()
 {
   mControl->Write(size);
   mControl->WriteFourCC(boxType.get());
   return NS_OK;
 }
 
 nsresult
@@ -1343,16 +1334,18 @@ Box::Find(const nsACString& aType, nsTAr
   return NS_OK;
 }
 
 Box::Box(const nsACString& aType, ISOControl* aControl)
   : size(8), mControl(aControl)
 {
   MOZ_ASSERT(aType.Length() == 4);
   boxType = aType;
+  aControl->GetAudioMetadata(mAudioMeta);
+  aControl->GetVideoMetadata(mVideoMeta);
 }
 
 FullBox::FullBox(const nsACString& aType, uint8_t aVersion, uint32_t aFlags,
                  ISOControl* aControl)
   : Box(aType, aControl)
 {
   // Cast to uint64_t due to VC2010  bug.
   std::bitset<24> tmp_flags((uint64_t)aFlags);
@@ -1385,17 +1378,16 @@ TrackBox::~TrackBox()
 
 SampleEntryBox::SampleEntryBox(const nsACString& aFormat, ISOControl* aControl)
   : Box(aFormat, aControl)
   , data_reference_index(0)
 {
   data_reference_index = 1; // There is only one data reference in each track.
   size += sizeof(reserved) +
           sizeof(data_reference_index);
-  mMeta.Init(aControl);
   memset(reserved, 0, sizeof(reserved));
 }
 
 nsresult
 SampleEntryBox::Write()
 {
   Box::Write();
   mControl->Write(reserved, sizeof(reserved));
@@ -1421,20 +1413,19 @@ AudioSampleEntry::AudioSampleEntry(const
   : SampleEntryBox(aFormat, aControl)
   , sound_version(0)
   , channels(2)
   , sample_size(16)
   , compressionId(0)
   , packet_size(0)
   , timeScale(0)
 {
-  mMeta.Init(mControl);
   memset(reserved2, 0 , sizeof(reserved2));
-  channels = mMeta.mAudMeta->Channels;
-  timeScale = mMeta.mAudMeta->SampleRate << 16;
+  channels = mAudioMeta->GetAudioChannels();
+  timeScale = mAudioMeta->GetAudioSampleRate() << 16;
 
   size += sizeof(sound_version) +
           sizeof(reserved2) +
           sizeof(sample_size) +
           sizeof(channels) +
           sizeof(packet_size) +
           sizeof(compressionId) +
           sizeof(timeScale);
@@ -1476,18 +1467,18 @@ VisualSampleEntry::VisualSampleEntry(con
   , frame_count(1)
   , depth(video_depth)
   , pre_defined(-1)
 {
   memset(reserved, 0 , sizeof(reserved));
   memset(compressorName, 0 , sizeof(compressorName));
 
   // both fields occupy 16 bits defined in 14496-2 6.2.3.
-  width = mMeta.mVidMeta->Width;
-  height = mMeta.mVidMeta->Height;
+  width = mVideoMeta->GetVideoWidth();
+  height = mVideoMeta->GetVideoHeight();
 
   size += sizeof(reserved) +
           sizeof(width) +
           sizeof(height) +
           sizeof(horizresolution) +
           sizeof(vertresolution) +
           sizeof(reserved2) +
           sizeof(frame_count) +
--- a/content/media/encoder/fmp4_muxer/ISOMediaBoxes.h
+++ b/content/media/encoder/fmp4_muxer/ISOMediaBoxes.h
@@ -21,18 +21,18 @@
 namespace mozilla {
 
 /**
  * track type from spec 8.4.3.3
  */
 #define Audio_Track 0x01
 #define Video_Track 0x02
 
-class AACTrackMetadata;
-class AVCTrackMetadata;
+class AudioTrackMetadata;
+class VideoTrackMetadata;
 class ES_Descriptor;
 class ISOControl;
 
 /**
  * This is the base class for all ISO media format boxes.
  * It provides the fields of box type(four CC) and size.
  * The data members in the beginning of a Box (or its descendants)
  * are the 14496-12 defined member. Other members prefix with 'm'
@@ -48,32 +48,16 @@ protected:
                      // 14496-12 table 1.
 
 public:
   // MuxerOperation methods
   nsresult Write() MOZ_OVERRIDE;
   nsresult Find(const nsACString& aType,
                 nsTArray<nsRefPtr<MuxerOperation>>& aOperations) MOZ_OVERRIDE;
 
-  // A helper class to check box written bytes number; it will compare
-  // the size generated from Box::Generate() and the actually written length in
-  // Box::Write().
-  class MetaHelper {
-  public:
-    nsresult Init(ISOControl* aControl);
-    bool AudioOnly() {
-      if (mAudMeta && !mVidMeta) {
-        return true;
-      }
-      return false;
-    }
-    nsRefPtr<AACTrackMetadata> mAudMeta;
-    nsRefPtr<AVCTrackMetadata> mVidMeta;
-  };
-
   // This helper class will compare the written size in Write() and the size in
   // Generate(). If their are not equal, it will assert.
   class BoxSizeChecker {
   public:
     BoxSizeChecker(ISOControl* aControl, uint32_t aSize);
     ~BoxSizeChecker();
 
     uint32_t ori_size;
@@ -81,16 +65,18 @@ public:
     ISOControl* mControl;
   };
 
 protected:
   Box() MOZ_DELETE;
   Box(const nsACString& aType, ISOControl* aControl);
 
   ISOControl* mControl;
+  nsRefPtr<AudioTrackMetadata> mAudioMeta;
+  nsRefPtr<VideoTrackMetadata> mVideoMeta;
 };
 
 /**
  * FullBox (and its descendants) is the box which contains the 'real' data
  * members. It is the edge in the ISO box structure and it doesn't contain
  * any box.
  *
  * This class is for inherited only, it shouldn't be instanced directly.
@@ -184,19 +170,16 @@ public:
   // MuxerOperation methods
   nsresult Generate(uint32_t* aBoxSize) MOZ_OVERRIDE;
   nsresult Write() MOZ_OVERRIDE;
 
   // MovieHeaderBox methods
   MovieHeaderBox(ISOControl* aControl);
   ~MovieHeaderBox();
   uint32_t GetTimeScale();
-
-protected:
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.4.2 'Media Header Box'
 // Box type: 'mdhd'
 class MediaHeaderBox : public FullBox {
 public:
   // ISO BMFF members
   uint32_t creation_time;
@@ -215,17 +198,16 @@ public:
 
   // MediaHeaderBox methods
   MediaHeaderBox(uint32_t aType, ISOControl* aControl);
   ~MediaHeaderBox();
   uint32_t GetTimeScale();
 
 protected:
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.3.1 'Track Box'
 // Box type: 'trak'
 // TrackBox contains TrackHeaderBox and MediaBox.
 class TrackBox : public DefaultContainerImpl {
 public:
   TrackBox(uint32_t aTrackType, ISOControl* aControl);
@@ -294,17 +276,16 @@ public:
   TrackRunBox(uint32_t aType, uint32_t aFlags, ISOControl* aControl);
   ~TrackRunBox();
 
 protected:
   uint32_t fillSampleTable();
 
   uint32_t mAllSampleSize;
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // tf_flags in TrackFragmentHeaderBox, 14496-12 8.8.7.1.
 #define base_data_offset_present         0x000001
 #define sample_description_index_present 0x000002
 #define default_sample_duration_present  0x000008
 #define default_sample_size_present      0x000010
 #define default_sample_flags_present     0x000020
@@ -328,17 +309,16 @@ public:
   nsresult UpdateBaseDataOffset(uint64_t aOffset); // The offset of the first
                                                    // sample in file.
 
   TrackFragmentHeaderBox(uint32_t aType, uint32_t aFlags, ISOControl* aControl);
   ~TrackFragmentHeaderBox();
 
 protected:
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.8.6 'Track Fragment Box'
 // Box type: 'traf'
 // TrackFragmentBox cotains TrackFragmentHeaderBox and TrackRunBox.
 class TrackFragmentBox : public DefaultContainerImpl {
 public:
   TrackFragmentBox(uint32_t aType, ISOControl* aControl);
@@ -399,29 +379,25 @@ public:
   nsresult Write() MOZ_OVERRIDE;
 
   // TrackExtendsBox methods
   TrackExtendsBox(uint32_t aType, ISOControl* aControl);
   ~TrackExtendsBox();
 
 protected:
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.8.1 'Movie Extends Box'
 // Box type: 'mvex'
 // MovieExtendsBox contains TrackExtendsBox.
 class MovieExtendsBox : public DefaultContainerImpl {
 public:
   MovieExtendsBox(ISOControl* aControl);
   ~MovieExtendsBox();
-
-protected:
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.7.5 'Chunk Offset Box'
 // Box type: 'stco'
 class ChunkOffsetBox : public FullBox {
 public:
   // ISO BMFF members
   typedef struct {
@@ -522,18 +498,16 @@ public:
   // sampleentrybox methods
   SampleEntryBox(const nsACString& aFormat, ISOControl* aControl);
 
   // MuxerOperation methods
   nsresult Write() MOZ_OVERRIDE;
 
 protected:
   SampleEntryBox() MOZ_DELETE;
-
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.5.2 'Sample Description Box'
 // Box type: 'stsd'
 class SampleDescriptionBox : public FullBox {
 public:
   // ISO BMFF members
   uint32_t entry_count;
@@ -759,17 +733,16 @@ public:
   nsresult Write() MOZ_OVERRIDE;
 
   // TrackHeaderBox methods
   TrackHeaderBox(uint32_t aType, ISOControl* aControl);
   ~TrackHeaderBox();
 
 protected:
   uint32_t mTrackType;
-  MetaHelper mMeta;
 };
 
 // 14496-12 8.4.3 'Handler Reference Box'
 // Box type: 'hdlr'
 class HandlerBox : public FullBox {
 public:
   // ISO BMFF members
   uint32_t pre_defined;
--- a/content/media/encoder/fmp4_muxer/ISOMediaWriter.cpp
+++ b/content/media/encoder/fmp4_muxer/ISOMediaWriter.cpp
@@ -201,27 +201,23 @@ ISOMediaWriter::GetContainerData(nsTArra
   return NS_OK;
 }
 
 nsresult
 ISOMediaWriter::SetMetadata(TrackMetadataBase* aMetadata)
 {
   if (aMetadata->GetKind() == TrackMetadataBase::METADATA_AAC ) {
     mControl->SetMetadata(aMetadata);
-    mAudioFragmentBuffer = new FragmentBuffer(Audio_Track,
-                                              FRAG_DURATION,
-                                              aMetadata);
+    mAudioFragmentBuffer = new FragmentBuffer(Audio_Track, FRAG_DURATION);
     mControl->SetFragment(mAudioFragmentBuffer);
     return NS_OK;
   }
   if (aMetadata->GetKind() == TrackMetadataBase::METADATA_AVC) {
     mControl->SetMetadata(aMetadata);
-    mVideoFragmentBuffer = new FragmentBuffer(Video_Track,
-                                              FRAG_DURATION,
-                                              aMetadata);
+    mVideoFragmentBuffer = new FragmentBuffer(Video_Track, FRAG_DURATION);
     mControl->SetFragment(mVideoFragmentBuffer);
     return NS_OK;
   }
 
   return NS_ERROR_FAILURE;
 }
 
 }  // namespace mozilla
--- a/content/media/encoder/fmp4_muxer/ISOMediaWriter.h
+++ b/content/media/encoder/fmp4_muxer/ISOMediaWriter.h
@@ -8,18 +8,16 @@
 
 #include "ContainerWriter.h"
 #include "nsIRunnable.h"
 
 namespace mozilla {
 
 class ISOControl;
 class FragmentBuffer;
-class AACTrackMetadata;
-class AVCTrackMetadata;
 class ISOMediaWriterRunnable;
 
 class ISOMediaWriter : public ContainerWriter
 {
 public:
   // Generate an fragmented MP4 stream, ISO/IEC 14496-12.
   // Brand names in 'ftyp' box are 'isom' and 'mp42'.
   const static uint32_t TYPE_FRAG_MP4 = 1 << 0;
--- a/content/media/encoder/fmp4_muxer/ISOTrackMetadata.h
+++ b/content/media/encoder/fmp4_muxer/ISOTrackMetadata.h
@@ -5,47 +5,66 @@
 
 #ifndef ISOTrackMetadata_h_
 #define ISOTrackMetadata_h_
 
 #include "TrackMetadataBase.h"
 
 namespace mozilla {
 
-class AACTrackMetadata : public TrackMetadataBase {
+class AACTrackMetadata : public AudioTrackMetadata {
 public:
-  uint32_t SampleRate;     // From 14496-3 table 1.16, it could be 7350 ~ 96000.
-  uint32_t FrameDuration;  // Audio frame duration based on SampleRate.
-  uint32_t FrameSize;      // Audio frame size, 0 is variant size.
-  uint32_t Channels;       // Channel number, it should be 1 or 2.
+  // AudioTrackMetadata members
+  uint32_t GetAudioFrameDuration() MOZ_OVERRIDE { return mFrameDuration; }
+  uint32_t GetAudioFrameSize() MOZ_OVERRIDE { return mFrameSize; }
+  uint32_t GetAudioSampleRate() MOZ_OVERRIDE { return mSampleRate; }
+  uint32_t GetAudioChannels() MOZ_OVERRIDE { return mChannels; }
 
+  // TrackMetadataBase member
+  MetadataKind GetKind() const MOZ_OVERRIDE { return METADATA_AAC; }
+
+  // AACTrackMetadata members
   AACTrackMetadata()
-    : SampleRate(0)
-    , FrameDuration(0)
-    , FrameSize(0)
-    , Channels(0) {
+    : mSampleRate(0)
+    , mFrameDuration(0)
+    , mFrameSize(0)
+    , mChannels(0) {
     MOZ_COUNT_CTOR(AACTrackMetadata);
   }
   ~AACTrackMetadata() { MOZ_COUNT_DTOR(AACTrackMetadata); }
-  MetadataKind GetKind() const MOZ_OVERRIDE { return METADATA_AAC; }
+
+  uint32_t mSampleRate;     // From 14496-3 table 1.16, it could be 7350 ~ 96000.
+  uint32_t mFrameDuration;  // Audio frame duration based on SampleRate.
+  uint32_t mFrameSize;      // Audio frame size, 0 is variant size.
+  uint32_t mChannels;       // Channel number, it should be 1 or 2.
 };
 
-class AVCTrackMetadata : public TrackMetadataBase {
+// AVC clock rate is 90k Hz.
+#define AVC_CLOCK_RATE 90000
+
+class AVCTrackMetadata : public VideoTrackMetadata {
 public:
-  uint32_t Height;
-  uint32_t Width;
-  uint32_t VideoFrequency;  // for AVC, it should be 90k Hz.
-  uint32_t FrameRate;       // frames per second
+  // VideoTrackMetadata members
+  uint32_t GetVideoHeight() MOZ_OVERRIDE { return mHeight; }
+  uint32_t GetVideoWidth() MOZ_OVERRIDE {return mWidth; }
+  uint32_t GetVideoClockRate() MOZ_OVERRIDE { return AVC_CLOCK_RATE; }
+  uint32_t GetVideoFrameRate() MOZ_OVERRIDE { return mFrameRate; }
 
+  // TrackMetadataBase member
+  MetadataKind GetKind() const MOZ_OVERRIDE { return METADATA_AVC; }
+
+  // AVCTrackMetadata members
   AVCTrackMetadata()
-    : Height(0)
-    , Width(0)
-    , VideoFrequency(0)
-    , FrameRate(0) {
+    : mHeight(0)
+    , mWidth(0)
+    , mFrameRate(0) {
     MOZ_COUNT_CTOR(AVCTrackMetadata);
   }
   ~AVCTrackMetadata() { MOZ_COUNT_DTOR(AVCTrackMetadata); }
-  MetadataKind GetKind() const MOZ_OVERRIDE { return METADATA_AVC; }
+
+  uint32_t mHeight;
+  uint32_t mWidth;
+  uint32_t mFrameRate;       // frames per second
 };
 
 }
 
 #endif // ISOTrackMetadata_h_
--- a/content/media/encoder/fmp4_muxer/MP4ESDS.cpp
+++ b/content/media/encoder/fmp4_muxer/MP4ESDS.cpp
@@ -101,18 +101,16 @@ ES_Descriptor::Write()
 }
 
 nsresult
 ES_Descriptor::Generate(uint32_t* aBoxSize)
 {
   nsresult rv;
   //   14496-1 '8.3.4 DecoderConfigDescriptor'
   //   14496-1 '10.2.3 SL Packet Header Configuration'
-  Box::MetaHelper meta;
-  meta.Init(mControl);
   FragmentBuffer* frag = mControl->GetFragment(Audio_Track);
   rv = frag->GetCSD(DecodeSpecificInfo);
   NS_ENSURE_SUCCESS(rv, rv);
 
   length = sizeof(ES_ID) + 1;
   length += DecodeSpecificInfo.Length();
 
   *aBoxSize = sizeof(tag) + sizeof(length) + length;