dom/media/ogg/OggCodecState.h
author Nathan Froyd <froydnj@mozilla.com>
Tue, 20 Nov 2018 10:10:31 -0500
changeset 503704 ce4515855b43514b4ee6e5ef8d450ed2dbfb1a73
parent 503396 0ceae9db9ec0be18daa1a279511ad305723185d4
child 505383 6f3709b3878117466168c40affa7bca0b60cf75b
permissions -rw-r--r--
Bug 1507922 - make ownership of MetadataTags more clear in the ogg code; r=gerald,jya Use UniquePtr for return types, so it's obvious who has ownership.

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#if !defined(OggCodecState_h_)
#define OggCodecState_h_

#include <ogg/ogg.h>
// For MOZ_SAMPLE_TYPE_*
#include "FlacFrameParser.h"
#include "VideoUtils.h"
#include <nsDeque.h>
#include <nsTArray.h>
#include <nsClassHashtable.h>

#include <theora/theoradec.h>
#ifdef MOZ_TREMOR
#include <tremor/ivorbiscodec.h>
#else
#include <vorbis/codec.h>
#endif

// Uncomment the following to validate that we're predicting the number
// of Vorbis samples in each packet correctly.
#define VALIDATE_VORBIS_SAMPLE_CALCULATION
#ifdef VALIDATE_VORBIS_SAMPLE_CALCULATION
#include <map>
#endif

struct OpusMSDecoder;

namespace mozilla {

class OpusParser;

struct OggPacketDeletePolicy {
  void operator()(ogg_packet* aPacket) const {
    delete[] aPacket->packet;
    delete aPacket;
  }
};

using OggPacketPtr = UniquePtr<ogg_packet, OggPacketDeletePolicy>;

// Deallocates a packet, used in OggPacketQueue below.
class OggPacketDeallocator : public nsDequeFunctor {
  virtual void operator()(void* aPacket) override {
    OggPacketDeletePolicy()(static_cast<ogg_packet*>(aPacket));
  }
};

// A queue of ogg_packets. When we read a page, we extract the page's packets
// and buffer them in the owning stream's OggCodecState. This is because
// if we're skipping up to the next keyframe in very large frame sized videos,
// there may be several megabytes of data between keyframes, and the
// ogg_stream_state would end up resizing its buffer every time we added a
// new 4KB page to the bitstream, which kills performance on Windows. This
// also gives us the option to timestamp packets rather than decoded
// frames/samples, reducing the amount of frames/samples we must decode to
// determine start-time at a particular offset, and gives us finer control
// over memory usage.
class OggPacketQueue : private nsDeque {
 public:
  OggPacketQueue() : nsDeque(new OggPacketDeallocator()) {}
  ~OggPacketQueue() { Erase(); }
  bool IsEmpty() { return nsDeque::GetSize() == 0; }
  void Append(OggPacketPtr aPacket);
  OggPacketPtr PopFront() {
    return OggPacketPtr(static_cast<ogg_packet*>(nsDeque::PopFront()));
  }
  ogg_packet* PeekFront() {
    return static_cast<ogg_packet*>(nsDeque::PeekFront());
  }
  OggPacketPtr Pop() {
    return OggPacketPtr(static_cast<ogg_packet*>(nsDeque::Pop()));
  }
  ogg_packet* operator[](size_t aIndex) const {
    return static_cast<ogg_packet*>(nsDeque::ObjectAt(aIndex));
  }
  size_t Length() const { return nsDeque::GetSize(); }
  void PushFront(OggPacketPtr aPacket) {
    nsDeque::PushFront(aPacket.release());
  }
  void Erase() { nsDeque::Erase(); }
};

// Encapsulates the data required for decoding an ogg bitstream and for
// converting granulepos to timestamps.
class OggCodecState {
 public:
  typedef mozilla::MetadataTags MetadataTags;
  // Ogg types we know about
  enum CodecType {
    TYPE_VORBIS = 0,
    TYPE_THEORA,
    TYPE_OPUS,
    TYPE_SKELETON,
    TYPE_FLAC,
    TYPE_UNKNOWN
  };

  virtual ~OggCodecState();

  // Factory for creating nsCodecStates. Use instead of constructor.
  // aPage should be a beginning-of-stream page.
  static OggCodecState* Create(ogg_page* aPage);

  virtual CodecType GetType() { return TYPE_UNKNOWN; }

  // Reads a header packet. Returns false if an error was encountered
  // while reading header packets. Callers should check DoneReadingHeaders()
  // to determine if the last header has been read.
  // This function takes ownership of the packet and is responsible for
  // releasing it or queuing it for later processing.
  virtual bool DecodeHeader(OggPacketPtr aPacket) {
    return (mDoneReadingHeaders = true);
  }

  // Build a hash table with tag metadata parsed from the stream.
  virtual UniquePtr<MetadataTags> GetTags() { return nullptr; }

  // Returns the end time that a granulepos represents.
  virtual int64_t Time(int64_t granulepos) { return -1; }

  // Returns the start time that a granulepos represents.
  virtual int64_t StartTime(int64_t granulepos) { return -1; }

  // Returns the duration of the given packet, if it can be determined.
  virtual int64_t PacketDuration(ogg_packet* aPacket) { return -1; }

  // Returns the start time of the given packet, if it can be determined.
  virtual int64_t PacketStartTime(ogg_packet* aPacket) {
    if (aPacket->granulepos < 0) {
      return -1;
    }
    int64_t endTime = Time(aPacket->granulepos);
    int64_t duration = PacketDuration(aPacket);
    if (duration > endTime) {
      // Audio preskip may eat a whole packet or more.
      return 0;
    } else {
      return endTime - duration;
    }
  }

  // Initializes the codec state.
  virtual bool Init() { return true; }

  // Returns true when this bitstream has finished reading all its
  // header packets.
  bool DoneReadingHeaders() { return mDoneReadingHeaders; }

  // Deactivates the bitstream. Only the primary video and audio bitstreams
  // should be active.
  void Deactivate() {
    mActive = false;
    mDoneReadingHeaders = true;
    Reset();
  }

  // Resets decoding state.
  virtual nsresult Reset();

  // Returns true if the OggCodecState thinks this packet is a header
  // packet. Note this does not verify the validity of the header packet,
  // it just guarantees that the packet is marked as a header packet (i.e.
  // it is definintely not a data packet). Do not use this to identify
  // streams, use it to filter header packets from data packets while
  // decoding.
  virtual bool IsHeader(ogg_packet* aPacket) { return false; }

  // Returns true if the OggCodecState thinks this packet represents a
  // keyframe, from which decoding can restart safely.
  virtual bool IsKeyframe(ogg_packet* aPacket) { return true; }

  // Returns true if there is a packet available for dequeueing in the stream.
  bool IsPacketReady();

  // Returns the next raw packet in the stream, or nullptr if there are no more
  // packets buffered in the packet queue. More packets can be buffered by
  // inserting one or more pages into the stream by calling PageIn().
  // The packet will have a valid granulepos.
  OggPacketPtr PacketOut();

  // Returns the next raw packet in the stream, or nullptr if there are no more
  // packets buffered in the packet queue, without consuming it.
  // The packet will have a valid granulepos.
  ogg_packet* PacketPeek();

  // Moves all raw packets from aOther to the front of the current packet queue.
  void PushFront(OggPacketQueue&& aOther);

  // Returns the next packet in the stream as a MediaRawData, or nullptr
  // if there are no more packets buffered in the packet queue. More packets
  // can be buffered by inserting one or more pages into the stream by calling
  // PageIn(). The packet will have a valid granulepos.
  virtual already_AddRefed<MediaRawData> PacketOutAsMediaRawData();

  // Extracts all packets from the page, and inserts them into the packet
  // queue. They can be extracted by calling PacketOut(). Packets from an
  // inactive stream are not buffered, i.e. this call has no effect for
  // inactive streams. Multiple pages may need to be inserted before
  // PacketOut() starts to return packets, as granulepos may need to be
  // captured.
  virtual nsresult PageIn(ogg_page* aPage);

  // Returns the maximum number of microseconds which a keyframe can be offset
  // from any given interframe.b
  virtual int64_t MaxKeyframeOffset() { return 0; }
  // Public access for mTheoraInfo.keyframe_granule_shift
  virtual int32_t KeyFrameGranuleJobs() { return 0; }

  // Number of packets read.
  uint64_t mPacketCount;

  // Serial number of the bitstream.
  uint32_t mSerial;

  // Ogg specific state.
  ogg_stream_state mState;

  // Queue of as yet undecoded packets. Packets are guaranteed to have
  // a valid granulepos.
  OggPacketQueue mPackets;

  // Is the bitstream active; whether we're decoding and playing this bitstream.
  bool mActive;

  // True when all headers packets have been read.
  bool mDoneReadingHeaders;

  virtual const TrackInfo* GetInfo() const {
    MOZ_RELEASE_ASSERT(false, "Can't be called directly");
    return nullptr;
  }

  // Validation utility for vorbis-style tag names.
  static bool IsValidVorbisTagName(nsCString& aName);

  // Utility method to parse and add a vorbis-style comment
  // to a metadata hash table. Most Ogg-encapsulated codecs
  // use the vorbis comment format for metadata.
  static bool AddVorbisComment(UniquePtr<MetadataTags>& aTags,
                               const char* aComment,
                               uint32_t aLength);

protected:
  // Constructs a new OggCodecState. aActive denotes whether the stream is
  // active. For streams of unsupported or unknown types, aActive should be
  // false.
  OggCodecState(ogg_page* aBosPage, bool aActive);

  // Deallocates all packets stored in mUnstamped, and clears the array.
  void ClearUnstamped();

  // Extracts packets out of mState until a data packet with a non -1
  // granulepos is encountered, or no more packets are readable. Header
  // packets are pushed into the packet queue immediately, and data packets
  // are buffered in mUnstamped. Once a non -1 granulepos packet is read
  // the granulepos of the packets in mUnstamped can be inferred, and they
  // can be pushed over to mPackets. Used by PageIn() implementations in
  // subclasses.
  nsresult PacketOutUntilGranulepos(bool& aFoundGranulepos);

  // Temporary buffer in which to store packets while we're reading packets
  // in order to capture granulepos.
  nsTArray<OggPacketPtr> mUnstamped;

  bool SetCodecSpecificConfig(MediaByteBuffer* aBuffer,
                              OggPacketQueue& aHeaders);

 private:
  bool InternalInit();
};

class VorbisState : public OggCodecState {
 public:
  explicit VorbisState(ogg_page* aBosPage);
  virtual ~VorbisState();

  CodecType GetType() override { return TYPE_VORBIS; }
  bool DecodeHeader(OggPacketPtr aPacket) override;
  int64_t Time(int64_t granulepos) override;
  int64_t PacketDuration(ogg_packet* aPacket) override;
  bool Init() override;
  nsresult Reset() override;
  bool IsHeader(ogg_packet* aPacket) override;
  nsresult PageIn(ogg_page* aPage) override;
  const TrackInfo* GetInfo() const override { return &mInfo; }

  // Return a hash table with tag metadata.
  UniquePtr<MetadataTags> GetTags() override;

 private:
  AudioInfo mInfo;
  vorbis_info mVorbisInfo;
  vorbis_comment mComment;
  vorbis_dsp_state mDsp;
  vorbis_block mBlock;
  OggPacketQueue mHeaders;

  // Returns the end time that a granulepos represents.
  static int64_t Time(vorbis_info* aInfo, int64_t aGranulePos);

  // Reconstructs the granulepos of Vorbis packets stored in the mUnstamped
  // array.
  nsresult ReconstructVorbisGranulepos();

  // The "block size" of the previously decoded Vorbis packet, or 0 if we've
  // not yet decoded anything. This is used to calculate the number of samples
  // in a Vorbis packet, since each Vorbis packet depends on the previous
  // packet while being decoded.
  long mPrevVorbisBlockSize;

  // Granulepos (end sample) of the last decoded Vorbis packet. This is used
  // to calculate the Vorbis granulepos when we don't find a granulepos to
  // back-propagate from.
  int64_t mGranulepos;

#ifdef VALIDATE_VORBIS_SAMPLE_CALCULATION
  // When validating that we've correctly predicted Vorbis packets' number
  // of samples, we store each packet's predicted number of samples in this
  // map, and verify we decode the predicted number of samples.
  std::map<ogg_packet*, long> mVorbisPacketSamples;
#endif

  // Records that aPacket is predicted to have aSamples samples.
  // This function has no effect if VALIDATE_VORBIS_SAMPLE_CALCULATION
  // is not defined.
  void RecordVorbisPacketSamples(ogg_packet* aPacket, long aSamples);

  // Verifies that aPacket has had its number of samples predicted.
  // This function has no effect if VALIDATE_VORBIS_SAMPLE_CALCULATION
  // is not defined.
  void AssertHasRecordedPacketSamples(ogg_packet* aPacket);

 public:
  // Asserts that the number of samples predicted for aPacket is aSamples.
  // This function has no effect if VALIDATE_VORBIS_SAMPLE_CALCULATION
  // is not defined.
  void ValidateVorbisPacketSamples(ogg_packet* aPacket, long aSamples);
};

// Returns 1 if the Theora info struct is decoding a media of Theora
// version (maj,min,sub) or later, otherwise returns 0.
int TheoraVersion(th_info* info, unsigned char maj, unsigned char min,
                  unsigned char sub);

class TheoraState : public OggCodecState {
 public:
  explicit TheoraState(ogg_page* aBosPage);
  virtual ~TheoraState();

  CodecType GetType() override { return TYPE_THEORA; }
  bool DecodeHeader(OggPacketPtr aPacket) override;
  int64_t Time(int64_t granulepos) override;
  int64_t StartTime(int64_t granulepos) override;
  int64_t PacketDuration(ogg_packet* aPacket) override;
  bool Init() override;
  nsresult Reset() override;
  bool IsHeader(ogg_packet* aPacket) override;
  bool IsKeyframe(ogg_packet* aPacket) override;
  nsresult PageIn(ogg_page* aPage) override;
  const TrackInfo* GetInfo() const override { return &mInfo; }
  int64_t MaxKeyframeOffset() override;
  int32_t KeyFrameGranuleJobs() override {
    return mTheoraInfo.keyframe_granule_shift;
  }

 private:
  // Returns the end time that a granulepos represents.
  static int64_t Time(th_info* aInfo, int64_t aGranulePos);

  th_info mTheoraInfo;
  th_comment mComment;
  th_setup_info* mSetup;
  th_dec_ctx* mCtx;

  VideoInfo mInfo;
  OggPacketQueue mHeaders;

  // Reconstructs the granulepos of Theora packets stored in the
  // mUnstamped array. mUnstamped must be filled with consecutive packets from
  // the stream, with the last packet having a known granulepos. Using this
  // known granulepos, and the known frame numbers, we recover the granulepos
  // of all frames in the array. This enables us to determine their timestamps.
  void ReconstructTheoraGranulepos();
};

class OpusState : public OggCodecState {
 public:
  explicit OpusState(ogg_page* aBosPage);
  virtual ~OpusState();

  CodecType GetType() override { return TYPE_OPUS; }
  bool DecodeHeader(OggPacketPtr aPacket) override;
  int64_t Time(int64_t aGranulepos) override;
  int64_t PacketDuration(ogg_packet* aPacket) override;
  bool Init() override;
  nsresult Reset() override;
  nsresult Reset(bool aStart);
  bool IsHeader(ogg_packet* aPacket) override;
  nsresult PageIn(ogg_page* aPage) override;
  already_AddRefed<MediaRawData> PacketOutAsMediaRawData() override;
  const TrackInfo* GetInfo() const override { return &mInfo; }

  // Returns the end time that a granulepos represents.
  static int64_t Time(int aPreSkip, int64_t aGranulepos);

  // Construct and return a table of tags from the metadata header.
  UniquePtr<MetadataTags> GetTags() override;

 private:
  nsAutoPtr<OpusParser> mParser;
  OpusMSDecoder* mDecoder;

  // Granule position (end sample) of the last decoded Opus packet. This is
  // used to calculate the amount we should trim from the last packet.
  int64_t mPrevPacketGranulepos;

  // Reconstructs the granulepos of Opus packets stored in the
  // mUnstamped array. mUnstamped must be filled with consecutive packets from
  // the stream, with the last packet having a known granulepos. Using this
  // known granulepos, and the known frame numbers, we recover the granulepos
  // of all frames in the array. This enables us to determine their timestamps.
  bool ReconstructOpusGranulepos();

  // Granule position (end sample) of the last decoded Opus page. This is
  // used to calculate the Opus per-packet granule positions on the last page,
  // where we may need to trim some samples from the end.
  int64_t mPrevPageGranulepos;
  AudioInfo mInfo;
  OggPacketQueue mHeaders;
};

// Constructs a 32bit version number out of two 16 bit major,minor
// version numbers.
#define SKELETON_VERSION(major, minor) (((major) << 16) | (minor))

enum EMsgHeaderType {
  eContentType,
  eRole,
  eName,
  eLanguage,
  eTitle,
  eDisplayHint,
  eAltitude,
  eTrackOrder,
  eTrackDependencies
};

typedef struct {
  const char* mPatternToRecognize;
  EMsgHeaderType mMsgHeaderType;
} FieldPatternType;

// Stores the message information for different logical bitstream.
typedef struct {
  nsClassHashtable<nsUint32HashKey, nsCString> mValuesStore;
} MessageField;

class SkeletonState : public OggCodecState {
 public:
  explicit SkeletonState(ogg_page* aBosPage);
  ~SkeletonState();

  nsClassHashtable<nsUint32HashKey, MessageField> mMsgFieldStore;

  CodecType GetType() override { return TYPE_SKELETON; }
  bool DecodeHeader(OggPacketPtr aPacket) override;
  int64_t Time(int64_t granulepos) override { return -1; }
  bool IsHeader(ogg_packet* aPacket) override { return true; }

  // Return true if the given time (in milliseconds) is within
  // the presentation time defined in the skeleton track.
  bool IsPresentable(int64_t aTime) { return aTime >= mPresentationTime; }

  // Stores the offset of the page on which a keyframe starts,
  // and its presentation time.
  class nsKeyPoint {
   public:
    nsKeyPoint() : mOffset(INT64_MAX), mTime(INT64_MAX) {}

    nsKeyPoint(int64_t aOffset, int64_t aTime)
        : mOffset(aOffset), mTime(aTime) {}

    // Offset from start of segment/link-in-the-chain in bytes.
    int64_t mOffset;

    // Presentation time in usecs.
    int64_t mTime;

    bool IsNull() { return mOffset == INT64_MAX && mTime == INT64_MAX; }
  };

  // Stores a keyframe's byte-offset, presentation time and the serialno
  // of the stream it belongs to.
  class nsSeekTarget {
   public:
    nsSeekTarget() : mSerial(0) {}
    nsKeyPoint mKeyPoint;
    uint32_t mSerial;
    bool IsNull() { return mKeyPoint.IsNull() && mSerial == 0; }
  };

  // Determines from the seek index the keyframe which you must seek back to
  // in order to get all keyframes required to render all streams with
  // serialnos in aTracks, at time aTarget.
  nsresult IndexedSeekTarget(int64_t aTarget, nsTArray<uint32_t>& aTracks,
                             nsSeekTarget& aResult);

  bool HasIndex() const { return mIndex.Count() > 0; }

  // Returns the duration of the active tracks in the media, if we have
  // an index. aTracks must be filled with the serialnos of the active tracks.
  // The duration is calculated as the greatest end time of all active tracks,
  // minus the smalled start time of all the active tracks.
  nsresult GetDuration(const nsTArray<uint32_t>& aTracks, int64_t& aDuration);

 private:
  // Decodes an index packet. Returns false on failure.
  bool DecodeIndex(ogg_packet* aPacket);
  // Decodes an fisbone packet. Returns false on failure.
  bool DecodeFisbone(ogg_packet* aPacket);

  // Gets the keypoint you must seek to in order to get the keyframe required
  // to render the stream at time aTarget on stream with serial aSerialno.
  nsresult IndexedSeekTargetForTrack(uint32_t aSerialno, int64_t aTarget,
                                     nsKeyPoint& aResult);

  // Version of the decoded skeleton track, as per the SKELETON_VERSION macro.
  uint32_t mVersion;

  // Presentation time of the resource in milliseconds
  int64_t mPresentationTime;

  // Length of the resource in bytes.
  int64_t mLength;

  // Stores the keyframe index and duration information for a particular
  // stream.
  class nsKeyFrameIndex {
   public:
    nsKeyFrameIndex(int64_t aStartTime, int64_t aEndTime)
        : mStartTime(aStartTime), mEndTime(aEndTime) {
      MOZ_COUNT_CTOR(nsKeyFrameIndex);
    }

    ~nsKeyFrameIndex() { MOZ_COUNT_DTOR(nsKeyFrameIndex); }

    void Add(int64_t aOffset, int64_t aTimeMs) {
      mKeyPoints.AppendElement(nsKeyPoint(aOffset, aTimeMs));
    }

    const nsKeyPoint& Get(uint32_t aIndex) const { return mKeyPoints[aIndex]; }

    uint32_t Length() const { return mKeyPoints.Length(); }

    // Presentation time of the first sample in this stream in usecs.
    const int64_t mStartTime;

    // End time of the last sample in this stream in usecs.
    const int64_t mEndTime;

   private:
    nsTArray<nsKeyPoint> mKeyPoints;
  };

  // Maps Ogg serialnos to the index-keypoint list.
  nsClassHashtable<nsUint32HashKey, nsKeyFrameIndex> mIndex;
};

class FlacState : public OggCodecState {
 public:
  explicit FlacState(ogg_page* aBosPage);

  CodecType GetType() override { return TYPE_FLAC; }
  bool DecodeHeader(OggPacketPtr aPacket) override;
  int64_t Time(int64_t granulepos) override;
  int64_t PacketDuration(ogg_packet* aPacket) override;
  bool IsHeader(ogg_packet* aPacket) override;
  nsresult PageIn(ogg_page* aPage) override;

  // Return a hash table with tag metadata.
  UniquePtr<MetadataTags> GetTags() override;

  const TrackInfo* GetInfo() const override;

 private:
  bool ReconstructFlacGranulepos(void);

  FlacFrameParser mParser;
};

}  // namespace mozilla

#endif