Bug 608166 - Add ARM assembly optimizations for libtheora r=chris.double,tterribe,khuey a=b-f
authorTimothy B. Terriberry <tterribe@vt.edu>
Mon, 08 Nov 2010 09:47:34 +0200
changeset 57091 af89c96d0939db005d103a6b86cd6efc8590be4d
parent 57090 2ef1a570e14eedbb0b6e9595ace7a78d77ac2298
child 57092 75ca0ab361b823dc9376bf6ddb56d3304b66b436
push id16784
push userromaxa@gmail.com
push dateMon, 08 Nov 2010 07:45:10 +0000
treeherdermozilla-central@af89c96d0939 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewerschris, b-f
bugs608166
milestone2.0b8pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 608166 - Add ARM assembly optimizations for libtheora r=chris.double,tterribe,khuey a=b-f
media/libtheora/AUTHORS
media/libtheora/CHANGES
media/libtheora/README
media/libtheora/README_MOZILLA
media/libtheora/bug559343.patch
media/libtheora/include/theora/config.h
media/libtheora/include/theora/theora.h
media/libtheora/include/theora/theoradec.h
media/libtheora/include/theora/theoraenc.h
media/libtheora/lib/Makefile.in
media/libtheora/lib/apiwrapper.h
media/libtheora/lib/arm/arm2gnu.pl
media/libtheora/lib/arm/armbits.h
media/libtheora/lib/arm/armbits.s
media/libtheora/lib/arm/armcpu.c
media/libtheora/lib/arm/armcpu.h
media/libtheora/lib/arm/armfrag.s
media/libtheora/lib/arm/armidct.s
media/libtheora/lib/arm/armint.h
media/libtheora/lib/arm/armloop.s
media/libtheora/lib/arm/armopts.s
media/libtheora/lib/arm/armstate.c
media/libtheora/lib/bitpack.c
media/libtheora/lib/bitpack.h
media/libtheora/lib/config.h
media/libtheora/lib/cpu.c
media/libtheora/lib/cpu.h
media/libtheora/lib/decinfo.c
media/libtheora/lib/decint.h
media/libtheora/lib/decode.c
media/libtheora/lib/encint.h
media/libtheora/lib/encoder_disabled.c
media/libtheora/lib/enquant.h
media/libtheora/lib/fragment.c
media/libtheora/lib/huffdec.c
media/libtheora/lib/huffdec.h
media/libtheora/lib/huffenc.h
media/libtheora/lib/idct.c
media/libtheora/lib/internal.c
media/libtheora/lib/internal.h
media/libtheora/lib/mathops.h
media/libtheora/lib/quant.c
media/libtheora/lib/state.c
media/libtheora/lib/state.h
media/libtheora/lib/x86/mmxfrag.c
media/libtheora/lib/x86/mmxfrag.h
media/libtheora/lib/x86/mmxidct.c
media/libtheora/lib/x86/mmxloop.h
media/libtheora/lib/x86/mmxstate.c
media/libtheora/lib/x86/sse2idct.c
media/libtheora/lib/x86/sse2trans.h
media/libtheora/lib/x86/x86cpu.c
media/libtheora/lib/x86/x86cpu.h
media/libtheora/lib/x86/x86int.h
media/libtheora/lib/x86/x86state.c
media/libtheora/lib/x86_vc/mmxfrag.c
media/libtheora/lib/x86_vc/mmxfrag.h
media/libtheora/lib/x86_vc/mmxidct.c
media/libtheora/lib/x86_vc/mmxstate.c
media/libtheora/lib/x86_vc/x86cpu.c
media/libtheora/lib/x86_vc/x86cpu.h
media/libtheora/lib/x86_vc/x86int.h
media/libtheora/lib/x86_vc/x86state.c
media/libtheora/update.sh
--- a/media/libtheora/AUTHORS
+++ b/media/libtheora/AUTHORS
@@ -11,16 +11,19 @@ Dan B. Miller
 	
 Rudolf Marek
 Wim Tayman
 Dan Lenski
 Nils Pipenbrinck
 Monty
 	- MMX optimized functions
 	
+David Schleef
+	- C64x port
+	
 Aaron Colwell
 Thomas Vander Stichele
 Jan Gerber
 Conrad Parker
 Cristian Adam
 Sebastian Pippin
 Simon Hosie
 	- Bug fixes, enhancements, build systems.
@@ -40,10 +43,12 @@ Brendan Cully
 Edward Hervey
 Adam Moss
 Colin Ward
 Jeremy C. Reed
 Arc Riley
 Rodolphe Ortalo
 	- Bug fixes
 
+Robin Watts
+	- ARM code optimisations
 
 and other Xiph.org contributors
--- a/media/libtheora/CHANGES
+++ b/media/libtheora/CHANGES
@@ -1,11 +1,31 @@
+libteora 1.2.0alpha1 (2010 September 23)
+
+- New 'ptalarbvorm' encoder with better rate/distortion optimization
+- New th_encode_ctl option for copying configuration from an existing
+  setup header, useful for splicing streams.
+- Returns TH_DUPFRAME in more cases.
+- Add ARM optimizations
+- Add TI C64x+ DSP optimizations
+- Other performance improvements
+- Rename speedlevel 2 to 3 and provide a new speedlevel 2
+- Various minor bug fixes
+
 libtheora 1.1.2 (unreleased snapshot)
 
- - no changes recorded
+ - Fix Huffman table decoding with OC_HUFF_SLUSH is set to 0
+ - Fix a frame size bug in player_example
+ - Add support for passing a buffer the size of the picture
+   region, rather than a full padded frame to th_encode_ycbcr_in()
+   as was possible with the legacy pre-1.0 API.
+ - 4:4:4 support in player_example using software yuv->rgb
+ - Better rgb->yuv conversion in png2theora
+ - Clean up warnings and local variables
+ - Build and documentation fixes
 
 libtheora 1.1.1 (2009 October 1)
 
  - Fix problems with MSVC inline assembly
  - Add the missing encoder_disabled.c to the distribution
  - build updates: autogen.sh should work better after switching systems
    and the MSVC project now defaults to the dynamic runtime library
  - Namespace some variables to avoid conflicts on wince.
@@ -123,17 +143,17 @@ libtheora 1.0beta2 (2007 October 12)
  - Build fixes for MingW32, MSVC
  - Improved format documentation.
 
 libtheora 1.0beta1 (2007 September 22)
 
  - Granulepos scheme modified to match other codecs. This bumps
    the bitstream revision to 3.2.1. Bitstreams marked 3.2.0 are
    handled correctly by this decoder. Older decoders will show
-   a one frame sync error in the less noticable direction.
+   a one frame sync error in the less noticeable direction.
 
 libtheora 1.0alpha8 (2007 September 18)
 
  - Switch to new spec compliant decoder from theora-exp branch.
    Written by Dr. Timothy Terriberry.
  - Add support to the encoder for using quantization settings
    provided by the application.
  - more assembly optimizations
--- a/media/libtheora/README
+++ b/media/libtheora/README
@@ -1,26 +1,29 @@
 -------------------------------------------------------------------------
-             The Xiph.org Foundation's libtheora 1.1 
+             The Xiph.org Foundation's libtheora 1.2
 -------------------------------------------------------------------------
 
 *** What is Theora?
 
 Theora is Xiph.Org's first publicly released video codec, intended
 for use within the Foundation's Ogg multimedia streaming system.
 Theora is derived directly from On2's VP3 codec, adds new features
 while allow it a longer useful lifetime as an competitive codec.
 
 The 1.0 release decoder supported all the new features, but the
 encoder is nearly identical to the VP3 code.
 
-The 1.1 release features a completely rewritten encoder, offering
+The 1.1 release featured a completely rewritten encoder, offering
 better performance and compression, and making more complete use
-of the format's feature set. Files produced by both encoders can
-be decoded by either release.
+of the format's feature set.
+
+The 1.2 release features significant additional improvements in
+compression and performance. Files produced by newer encoders can
+be decoded by earlier releases.
 
 *** Where is Theora?
 
 Theora's main site is www.theora.org.  Theora and related libraries
 can be gotten from www.theora.org or the main Xiph.Org site at
 www.xiph.org.  Development source is kept in an open subversion 
 repository, see http://theora.org/svn/ for instructions.
 
@@ -36,16 +39,17 @@ Requirements summary:
 
       libogg 1.1 or newer.
 
   For example encoder:
 
       as above,
 
       libvorbis and libvorbisenc 1.0.1 or newer.
+      (libvorbis 1.3.1 or newer for 5.1 audio)
 
   For creating a source distribution package:
 
       as above,
 
       Doxygen to build the API documentation,
       pdflatex and fig2dev to build the format specification
         (transfig package in Ubuntu).
@@ -61,17 +65,17 @@ The provided build system is the GNU aut
 the main library, libtheora, should already build smoothly on any
 system.  Failure of libtheora to build on a GNU-enabled system is
 considered a bug; please report problems to theora-dev@xiph.org.
 
 Windows build support is included in the win32 directory.
 
 Project files for Apple XCode are included in the macosx directory.
 
-There is also an experimental scons build.
+There is also a more limited scons build.
 
 *** How do I use the sample encoder?
 
 The sample encoder takes raw video in YUV4MPEG2 format, as used by
 lavtools, mjpeg-tools and other packages. The encoder expects audio,
 if any, in a separate wave WAV file. Try 'encoder_example -h' for a 
 complete list of options.
 
--- a/media/libtheora/README_MOZILLA
+++ b/media/libtheora/README_MOZILLA
@@ -1,7 +1,5 @@
 The source from this directory was copied from the theora subversion trunk
 using the update.sh script. The changes made were those applied by update.sh,
 the addition/update of Makefile.in files for the Mozilla build system.
 
-The subversion revision used was r16712.
-
-bug559343.patch: Silence Coverity warning.
+The subversion revision used was r17578.
deleted file mode 100644
--- a/media/libtheora/bug559343.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-diff --git a/media/libtheora/lib/state.c b/media/libtheora/lib/state.c
---- a/media/libtheora/lib/state.c
-+++ b/media/libtheora/lib/state.c
-@@ -87,17 +87,17 @@ static void oc_sb_create_plane_mapping(o
-       int       quadi;
-       int       i;
-       /*Figure out how many rows of blocks in this super block lie within the
-          image.*/
-       jmax=_hfrags-x;
-       if(jmax>4)jmax=4;
-       else if(jmax<=0)break;
-       /*By default, set all fragment indices to -1.*/
--      memset(_sb_maps[sbi][0],0xFF,sizeof(_sb_maps[sbi]));
-+      memset(_sb_maps[sbi],0xFF,sizeof(_sb_maps[sbi]));
-       /*Fill in the fragment map for this super block.*/
-       xfrag=yfrag+x;
-       for(i=0;i<imax;i++){
-         int j;
-         for(j=0;j<jmax;j++){
-           _sb_maps[sbi][SB_MAP[i][j][0]][SB_MAP[i][j][1]]=xfrag+j;
-         }
-         xfrag+=_hfrags;
deleted file mode 100644
--- a/media/libtheora/include/theora/config.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* config.h.  Generated from config.h.in by configure.  */
-/* config.h.in.  Generated from configure.ac by autoheader.  */
-
-/* libcairo is available for visual debugging output */
-/* #undef HAVE_CAIRO */
-
-/* Define to 1 if you have the <dlfcn.h> header file. */
-#define HAVE_DLFCN_H 1
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#define HAVE_INTTYPES_H 1
-
-/* Define to 1 if you have the <machine/soundcard.h> header file. */
-/* #undef HAVE_MACHINE_SOUNDCARD_H */
-
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
-/* Define to 1 if you have the <soundcard.h> header file. */
-/* #undef HAVE_SOUNDCARD_H */
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#define HAVE_STDINT_H 1
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#define HAVE_STDLIB_H 1
-
-/* Define to 1 if you have the <strings.h> header file. */
-#define HAVE_STRINGS_H 1
-
-/* Define to 1 if you have the <string.h> header file. */
-#define HAVE_STRING_H 1
-
-/* Define to 1 if you have the <sys/soundcard.h> header file. */
-/* #undef HAVE_SYS_SOUNDCARD_H */
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#define HAVE_SYS_STAT_H 1
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#define HAVE_SYS_TYPES_H 1
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#define HAVE_UNISTD_H 1
-
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
-   */
-#define LT_OBJDIR ".libs/"
-
-/* Define to 1 if your C compiler doesn't accept -c and -o together. */
-/* #undef NO_MINUS_C_MINUS_O */
-
-/* make use of x86_64 asm optimization */
-/* #undef OC_X86_64_ASM */
-
-/* make use of x86 asm optimization */
- /**/
-
-/* Name of package */
-#define PACKAGE "libtheora"
-
-/* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT ""
-
-/* Define to the full name of this package. */
-#define PACKAGE_NAME "libtheora"
-
-/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libtheora 1.1.1+svn"
-
-/* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "libtheora"
-
-/* Define to the home page for this package. */
-#define PACKAGE_URL ""
-
-/* Define to the version of this package. */
-#define PACKAGE_VERSION "1.1.1+svn"
-
-/* Define to 1 if you have the ANSI C header files. */
-#define STDC_HEADERS 1
-
-/* Define to exclude encode support from the build */
-/* #undef THEORA_DISABLE_ENCODE */
-
-/* Define to exclude floating point code from the build */
-/* #undef THEORA_DISABLE_FLOAT */
-
-/* Version number of package */
-#define VERSION "1.1.1+svn"
--- a/media/libtheora/include/theora/theora.h
+++ b/media/libtheora/include/theora/theora.h
@@ -174,17 +174,17 @@ typedef enum {
  * These enumerate the available chroma subsampling options supported
  * by the theora format. See Section 4.4 of the specification for
  * exact definitions.
  */
 typedef enum {
   OC_PF_420,    /**< Chroma subsampling by 2 in each direction (4:2:0) */
   OC_PF_RSVD,   /**< Reserved value */
   OC_PF_422,    /**< Horizonatal chroma subsampling by 2 (4:2:2) */
-  OC_PF_444,    /**< No chroma subsampling at all (4:4:4) */
+  OC_PF_444     /**< No chroma subsampling at all (4:4:4) */
 } theora_pixelformat;
 
 /**
  * Theora bitstream info.
  * Contains the basic playback parameters for a stream,
  * corresponding to the initial 'info' header packet.
  * 
  * Encoded theora frames must be a multiple of 16 in width and height.
--- a/media/libtheora/include/theora/theoradec.h
+++ b/media/libtheora/include/theora/theoradec.h
@@ -278,17 +278,18 @@ extern int th_decode_ctl(th_dec_ctx *_de
  *                  packet is stored in this location.
  *                 This is computed incrementally from previously decoded
  *                  packets.
  *                 After a seek, the correct granule position must be set via
  *                  #TH_DECCTL_SET_GRANPOS for this to work properly.
  * \retval 0             Success.
  *                       A new decoded frame can be retrieved by calling
  *                        th_decode_ycbcr_out().
- * \retval TH_DUPFRAME   The packet represented a dropped (0-byte) frame.
+ * \retval TH_DUPFRAME   The packet represented a dropped frame (either a
+ *                        0-byte frame or an INTER frame with no coded blocks).
  *                       The player can skip the call to th_decode_ycbcr_out(),
  *                        as the contents of the decoded frame buffer have not
  *                        changed.
  * \retval TH_EFAULT     \a _dec or \a _op was <tt>NULL</tt>.
  * \retval TH_EBADPACKET \a _op does not contain encoded video data.
  * \retval TH_EIMPL      The video data uses bitstream features which this
  *                        library does not support.*/
 extern int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
--- a/media/libtheora/include/theora/theoraenc.h
+++ b/media/libtheora/include/theora/theoraenc.h
@@ -38,47 +38,47 @@ extern "C" {
  * Keep any experimental or vendor-specific values above \c 0x8000.*/
 /*@{*/
 /**Sets the Huffman tables to use.
  * The tables are copied, not stored by reference, so they can be freed after
  *  this call.
  * <tt>NULL</tt> may be specified to revert to the default tables.
  *
  * \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
- * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
  * \retval TH_EINVAL Encoding has already begun or one or more of the given
  *                     tables is not full or prefix-free, \a _buf is
  *                     <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
  *                     non-<tt>NULL</tt> and \a _buf_sz is not
  *                     <tt>sizeof(#th_huff_code)*#TH_NHUFFMAN_TABLES*#TH_NDCT_TOKENS</tt>.
  * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_HUFFMAN_CODES (0)
 /**Sets the quantization parameters to use.
  * The parameters are copied, not stored by reference, so they can be freed
  *  after this call.
  * <tt>NULL</tt> may be specified to revert to the default parameters.
  *
  * \param[in] _buf #th_quant_info
- * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
  * \retval TH_EINVAL Encoding has already begun, \a _buf is 
  *                    <tt>NULL</tt> and \a _buf_sz is not zero,
  *                    or \a _buf is non-<tt>NULL</tt> and
  *                    \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_QUANT_PARAMS (2)
 /**Sets the maximum distance between key frames.
  * This can be changed during an encode, but will be bounded by
  *  <tt>1<<th_info#keyframe_granule_shift</tt>.
  * If it is set before encoding begins, th_info#keyframe_granule_shift will
  *  be enlarged appropriately.
  *
  * \param[in]  _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
  *                   frames.
  * \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
 /**Disables any encoder features that would prevent lossless transcoding back
  *  to VP3.
  * This primarily means disabling block-adaptive quantization and always coding
  *  all four luma blocks in a macro block when 4MV is used.
  * It also includes using the VP3 quantization tables and Huffman codes; if you
@@ -96,40 +96,40 @@ extern "C" {
  * \param[in]  _buf <tt>int</tt>: a non-zero value to enable VP3 compatibility,
  *                   or 0 to disable it (the default).
  * \param[out] _buf <tt>int</tt>: 1 if all bitstream features required for
  *                   VP3-compatibility could be set, and 0 otherwise.
  *                  The latter will be returned if the pixel format is not
  *                   4:2:0, the picture region is smaller than the full frame,
  *                   or if encoding has begun, preventing the quantization
  *                   tables and codebooks from being set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
 /**Gets the maximum speed level.
  * Higher speed levels favor quicker encoding over better quality per bit.
  * Depending on the encoding mode, and the internal algorithms used, quality
  *  may actually improve, but in this case bitrate will also likely increase.
  * In any case, overall rate/distortion performance will probably decrease.
  * The maximum value, and the meaning of each value, may change depending on
  *  the current encoding mode (VBR vs. constant quality, etc.).
  *
  * \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
 #define TH_ENCCTL_GET_SPLEVEL_MAX (12)
 /**Sets the speed level.
  * The current speed level may be retrieved using #TH_ENCCTL_GET_SPLEVEL.
  *
  * \param[in] _buf <tt>int</tt>: The new encoding speed level.
  *                 0 is slowest, larger values use less CPU.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
  *                    encoding speed level is out of bounds.
  *                   The maximum encoding speed level may be
  *                    implementation- and encoding mode-specific, and can be
  *                    obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
  * \retval TH_EIMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
 #define TH_ENCCTL_SET_SPLEVEL (14)
@@ -137,17 +137,17 @@ extern "C" {
  * The default speed level may vary according to encoder implementation, but if
  *  this control code is not supported (it returns #TH_EIMPL), the default may
  *  be assumed to be the slowest available speed (0).
  * The maximum encoding speed level may be implementation- and encoding
  *  mode-specific, and can be obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
  *
  * \param[out] _buf <tt>int</tt>: The current encoding speed level.
  *                  0 is slowest, larger values use less CPU.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
 #define TH_ENCCTL_GET_SPLEVEL (16)
 /**Sets the number of duplicates of the next frame to produce.
  * Although libtheora can encode duplicate frames very cheaply, it costs some
  *  amount of CPU to detect them, and a run of duplicates cannot span a
  *  keyframe boundary.
@@ -157,17 +157,17 @@ extern "C" {
  *  rate control decisions, and reduces CPU usage as well, when compared to
  *  just submitting the same frame for encoding multiple times.
  * This setting only applies to the next frame submitted for encoding.
  * You MUST call th_encode_packetout() repeatedly until it returns 0, or the
  *  extra duplicate frames will be lost.
  *
  * \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
  *                 If this is negative or zero, no duplicates will be produced.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
  *                    number of duplicates is greater than or equal to the
  *                    maximum keyframe interval.
  *                   In the latter case, NO duplicate frames will be produced.
  *                   You must ensure that the maximum keyframe interval is set
  *                    larger than the maximum number of duplicates you will
  *                    ever wish to insert prior to encoding.
  * \retval TH_EIMPL   Not supported by this implementation in the current
@@ -182,17 +182,17 @@ extern "C" {
  *
  * \param[in] _buf <tt>int</tt>: Any combination of
  *                  \ref ratectlflags "the available flags":
  *                 - #TH_RATECTL_DROP_FRAMES: Enable frame dropping.
  *                 - #TH_RATECTL_CAP_OVERFLOW: Don't bank excess bits for later
  *                    use.
  *                 - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
  *                    later.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
  *                    is not enabled.
  * \retval TH_EIMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
 #define TH_ENCCTL_SET_RATE_FLAGS (20)
 /**Sets the size of the bitrate management bit reservoir as a function
  *  of number of frames.
  * The reservoir size affects how quickly bitrate management reacts to
@@ -206,17 +206,17 @@ extern "C" {
  * An implementation may impose some limits on the size of a reservoir it can
  *  handle, in which case the actual reservoir size may not be exactly what was
  *  requested.
  * The actual value set will be returned.
  *
  * \param[in]  _buf <tt>int</tt>: Requested size of the reservoir measured in
  *                   frames.
  * \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
  *                    is not enabled.  The buffer has an implementation
  *                    defined minimum and maximum size and the value in _buf
  *                    will be adjusted to match the actual value set.
  * \retval TH_EIMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
 #define TH_ENCCTL_SET_RATE_BUFFER (22)
 /**Enable pass 1 of two-pass encoding mode and retrieve the first pass metrics.
@@ -238,17 +238,17 @@ extern "C" {
  *
  * \param[out] <tt>char *</tt>_buf: Returns a pointer to internal storage
  *              containing the two pass metrics data.
  *             This storage is only valid until the next call, or until the
  *              encoder context is freed, and must be copied by the
  *              application.
  * \retval >=0       The number of bytes of metric data available in the
  *                    returned buffer.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
  *                    bitrate has been set, or the first call was made after
  *                    the first frame was submitted for encoding.
  * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_2PASS_OUT (24)
 /**Submits two-pass encoding metric data collected the first encoding pass to
  *  the second pass.
  * The first call must be made before the first frame is encoded, and a target
@@ -278,17 +278,17 @@ extern "C" {
  *                 The summary data returned at the end of pass 1 must be at
  *                  the head of the buffer on the first call with a
  *                  non-<tt>NULL</tt> \a _buf, and the placeholder data
  *                  returned at the start of pass 1 should be omitted.
  *                 After each call you should advance this buffer by the number
  *                  of bytes consumed.
  * \retval >0            The number of bytes of metric data required/consumed.
  * \retval 0             No more data is required before the next frame.
- * \retval TH_EFAULT     \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc is <tt>NULL</tt>.
  * \retval TH_EINVAL     No target bitrate has been set, or the first call was
  *                        made after the first frame was submitted for
  *                        encoding.
  * \retval TH_ENOTFORMAT The data did not appear to be pass 1 from a compatible
  *                        implementation of this library.
  * \retval TH_EBADHEADER The data was invalid; this may be returned when
  *                        attempting to read an aborted pass 1 file that still
  *                        has the placeholder data in place of the summary
@@ -301,17 +301,17 @@ extern "C" {
  *  #TH_ENCCTL_SET_BITRATE (this restriction may be relaxed in a future
  *  version).
  * If it is set before the headers are emitted, the target quality encoded in
  *  them will be updated.
  *
  * \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
  *                  inclusive.
  * \retval 0             Success.
- * \retval TH_EFAULT     \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL     A target bitrate has already been specified, or the
  *                        quality index was not in the range 0...63.
  * \retval TH_EIMPL       Not supported by this implementation.*/
 #define TH_ENCCTL_SET_QUALITY (28)
 /**Sets the current encoding bitrate.
  * Once a bitrate is set, the encoder must use a rate-controlled mode for all
  *  future frames (this restriction may be relaxed in a future version).
  * If it is set before the headers are emitted, the target bitrate encoded in
@@ -323,20 +323,60 @@ extern "C" {
  *  used fewer, holding the excess in reserve.
  * The exact transition between the two bitrates is not well-defined by this
  *  API, but may be affected by flags set with #TH_ENCCTL_SET_RATE_FLAGS.
  * After a number of frames equal to the buffer delay, one may expect further
  *  output to average at the target bitrate.
  *
  * \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
  * \retval 0             Success.
- * \retval TH_EFAULT     \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL     The target bitrate was not positive.
  * \retval TH_EIMPL       Not supported by this implementation.*/
 #define TH_ENCCTL_SET_BITRATE (30)
+/**Sets the configuration to be compatible with that from the given setup
+ *  header.
+ * This sets the Huffman codebooks and quantization parameters to match those
+ *  found in the given setup header.
+ * This guarantees that packets encoded by this encoder will be decodable using
+ *  a decoder configured with the passed-in setup header.
+ * It does <em>not</em> guarantee that th_encode_flushheader() will produce a
+ *  bit-identical setup header, only that they will be compatible.
+ * If you need a bit-identical setup header, then use the one you passed into
+ *  this command, and not the one returned by th_encode_flushheader().
+ *
+ * This also does <em>not</em> enable or disable VP3 compatibility; that is not
+ *  signaled in the setup header (or anywhere else in the encoded stream), and
+ *  is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function.
+ * If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks
+ *  and quantization parameters to match the given setup header, you should
+ *  enable VP3 compatibility before invoking this command, otherwise the
+ *  codebooks and quantization parameters will be reset to the VP3 defaults.
+ *
+ * The current encoder does not support Huffman codebooks which do not contain
+ *  codewords for all 32 tokens.
+ * Such codebooks are legal, according to the specification, but cannot be
+ *  configured with this function.
+ *
+ * \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy
+ *                                            the configuration from.
+ *                                           This should be the original,
+ *                                            undecoded setup header packet,
+ *                                            and <em>not</em> a #th_setup_info
+ *                                            structure filled in by
+ *                                            th_decode_headerin().
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     Encoding has already begun, so the codebooks and
+ *                        quantization parameters cannot be changed, or the
+ *                        data in the setup header was not supported by this
+ *                        encoder.
+ * \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet.
+ * \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_COMPAT_CONFIG (32)
 
 /*@}*/
 
 
 /**\name TH_ENCCTL_SET_RATE_FLAGS flags
  * \anchor ratectlflags
  * These are the flags available for use with #TH_ENCCTL_SET_RATE_FLAGS.*/
 /*@{*/
@@ -436,21 +476,35 @@ extern int th_encode_ctl(th_enc_ctx *_en
  *          produced.
  * \retval 0         No packet was produced, and no more header packets remain.
  * \retval TH_EFAULT \a _enc, \a _comments, or \a _op was <tt>NULL</tt>.*/
 extern int th_encode_flushheader(th_enc_ctx *_enc,
  th_comment *_comments,ogg_packet *_op);
 /**Submits an uncompressed frame to the encoder.
  * \param _enc   A #th_enc_ctx handle.
  * \param _ycbcr A buffer of Y'CbCr data to encode.
+ *               If the width and height of the buffer matches the frame size
+ *                the encoder was initialized with, the encoder will only
+ *                reference the portion inside the picture region.
+ *               Any data outside this region will be ignored, and need not map
+ *                to a valid address.
+ *               Alternatively, you can pass a buffer equal to the size of the
+ *                picture region, if this is less than the full frame size.
+ *               When using subsampled chroma planes, odd picture sizes or odd
+ *                picture offsets may require an unexpected chroma plane size,
+ *                and their use is generally discouraged, as they will not be
+ *                well-supported by players and other media frameworks.
+ *               See Section 4.4 of 
+ *                <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *                specification</a> for details if you wish to use them anyway.
  * \retval 0         Success.
  * \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
- * \retval TH_EINVAL The buffer size does not match the frame size the encoder
- *                    was initialized with, or encoding has already
- *                    completed.*/
+ * \retval TH_EINVAL The buffer size matches neither the frame size nor the
+ *                    picture size the encoder was initialized with, or
+ *                    encoding has already completed.*/
 extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
 /**Retrieves encoded video data packets.
  * This should be called repeatedly after each frame is submitted to flush any
  *  encoded packets, until it returns 0.
  * The encoder will not buffer these packets as subsequent frames are
  *  compressed, so a failure to do so will result in lost video data.
  * \note Currently the encoder operates in a one-frame-in, one-packet-out
  *        manner.
--- a/media/libtheora/lib/Makefile.in
+++ b/media/libtheora/lib/Makefile.in
@@ -29,75 +29,127 @@
 # use your version of this file under the terms of the MPL, indicate your
 # decision by deleting the provisions above and replace them with the notice
 # and other provisions required by the GPL or the LGPL. If you do not delete
 # the provisions above, a recipient may use your version of this file under
 # the terms of any one of the MPL, the GPL or the LGPL.
 #
 # ***** END LICENSE BLOCK *****
 
-DEPTH		= ../../..
-topsrcdir	= @top_srcdir@
-srcdir		= @srcdir@
+DEPTH     = ../../..
+topsrcdir = @top_srcdir@
+srcdir    = @srcdir@
 
 include $(DEPTH)/config/autoconf.mk
 
-MODULE		= theora
-LIBRARY_NAME	= theora
-FORCE_STATIC_LIB= 1
+MODULE           = theora
+LIBRARY_NAME     = theora
+FORCE_STATIC_LIB = 1
 
 # The encoder is currently not included.
 DEFINES += -DTHEORA_DISABLE_ENCODE
 
 ifeq ($(findstring 86,$(OS_TEST)), 86)
 ifneq ($(OS_ARCH),SunOS)
 ifneq ($(OS_ARCH)$(OS_TEST),WINNTx86_64)
-DEFINES += -DOC_X86_ASM -DUSE_ASM
+DEFINES += -DOC_X86_ASM
+ifeq (64,$(findstring 64,$(OS_TEST)))
+DEFINES += -DOC_X86_64_ASM
+endif
 endif
 endif
 endif
 
-VPATH		:= $(srcdir)
+VPATH := $(srcdir)
 
-CSRCS		= \
-		apiwrapper.c \
-		bitpack.c \
-		decapiwrapper.c \
-		decinfo.c \
-		decode.c \
-		dequant.c \
-		encoder_disabled.c \
-		fragment.c \
-		huffdec.c \
-		idct.c \
-		info.c \
-		internal.c \
-		quant.c \
-		state.c \
-		$(NULL)
+CSRCS = \
+  apiwrapper.c \
+  bitpack.c \
+  decapiwrapper.c \
+  decinfo.c \
+  decode.c \
+  dequant.c \
+  fragment.c \
+  huffdec.c \
+  idct.c \
+  info.c \
+  internal.c \
+  quant.c \
+  state.c \
+  $(NULL)
 
 ifeq ($(findstring 86,$(OS_TEST)), 86)
 ifdef _MSC_VER
 ifneq (64,$(findstring 64,$(OS_TEST)))
-VPATH		+= $(srcdir)/x86_vc
+VPATH += $(srcdir)/x86_vc
 
-CSRCS		+= \
-		mmxidct.c \
-		mmxfrag.c \
-		mmxstate.c \
-		x86state.c \
-		$(NULL)
+CSRCS += \
+  mmxidct.c \
+  mmxfrag.c \
+  mmxstate.c \
+  x86state.c \
+  x86cpu.c \
+  $(NULL)
 endif
 else
-VPATH		+= $(srcdir)/x86
+VPATH += $(srcdir)/x86
+
+CSRCS += \
+  mmxidct.c \
+  mmxfrag.c \
+  mmxstate.c \
+  sse2idct.c \
+  x86state.c \
+  x86cpu.c \
+  $(NULL)
+endif
+endif
+
+ifdef GNU_AS
+ifeq ($(findstring arm,$(OS_TEST)), arm)
+
+VPATH += $(srcdir)/arm
+
+CSRCS += \
+  armcpu.c \
+  armstate.c \
+  $(NULL)
+
+DEFINES += -DOC_ARM_ASM -DOC_ARM_ASM_EDSP -DOC_ARM_ASM_MEDIA -DOC_ARM_ASM_NEON
 
-CSRCS		+= \
-		mmxidct.c \
-		mmxfrag.c \
-		mmxstate.c \
-		x86state.c \
-		$(NULL)
+# The Android NDK doesn't pre-define anything to indicate the OS it's on, so
+# do it for them.
+ifeq ($(OS_TARGET),Android)
+DEFINES += -D__linux__
+endif
+
+THEORA_ASFILES  = \
+  armbits.s \
+  armfrag.s \
+  armidct.s \
+  armloop.s \
+  armopts.s \
+  $(NULL)
+
+ASFILES = $(patsubst %.s,%-gnu.$(ASM_SUFFIX),$(THEORA_ASFILES))
+
+# These flags are a lie; they're just used to enable the requisite
+# opcodes; actual arch detection is done at runtime.
+ASFLAGS = -march=armv7-a -mfpu=neon
+
+armfrag-gnu.$(ASM_SUFFIX): armopts-gnu.S
+armidct-gnu.$(ASM_SUFFIX): armopts-gnu.S
+armloop-gnu.$(ASM_SUFFIX): armopts-gnu.S
+
+# armopts needs a specific rule, because arm2gnu.pl will always add the .S
+# suffix when translating the files that include it.
+armopts-gnu.S: armopts.s
+	$(PERL) $(srcdir)/arm/arm2gnu.pl < $< > $@
+# For all others, we can use an implicit rule with the configured $(ASM_SUFFIX).
+%-gnu.$(ASM_SUFFIX): %.s
+	$(PERL) $(srcdir)/arm/arm2gnu.pl < $< > $@
+
 endif
 endif
 
 include $(topsrcdir)/config/rules.mk
 
 LOCAL_INCLUDES = -I$(srcdir)
--- a/media/libtheora/lib/apiwrapper.h
+++ b/media/libtheora/lib/apiwrapper.h
@@ -16,17 +16,17 @@
  ********************************************************************/
 
 #if !defined(_apiwrapper_H)
 # define _apiwrapper_H (1)
 # include <ogg/ogg.h>
 # include <theora/theora.h>
 # include "theora/theoradec.h"
 # include "theora/theoraenc.h"
-# include "internal.h"
+# include "state.h"
 
 typedef struct th_api_wrapper th_api_wrapper;
 typedef struct th_api_info    th_api_info;
 
 /*Provide an entry point for the codec setup to clear itself in case we ever
    want to break pieces off into a common base library shared by encoder and
    decoder.
   In addition, this makes several other pieces of the API wrapper cleaner.*/
new file mode 100755
--- /dev/null
+++ b/media/libtheora/lib/arm/arm2gnu.pl
@@ -0,0 +1,271 @@
+#!/usr/bin/perl
+
+my $bigend;  # little/big endian
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+    if $running_under_some_shell;
+
+while ($ARGV[0] =~ /^-/) {
+    $_ = shift;
+  last if /^--/;
+    if (/^-n/) {
+    $nflag++;
+    next;
+    }
+    die "I don't recognize this switch: $_\\n";
+}
+$printit++ unless $nflag;
+
+$\ = "\n";      # automatically add newline on print
+$n=0;
+
+$thumb = 0;     # ARM mode by default, not Thumb.
+
+LINE:
+while (<>) {
+
+    # For ADRLs we need to add a new line after the substituted one.
+    $addPadding = 0;
+
+    # First, we do not dare to touch *anything* inside double quotes, do we?
+    # Second, if you want a dollar character in the string,
+    # insert two of them -- that's how ARM C and assembler treat strings.
+    s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1:   .ascii \"/   && do { s/\$\$/\$/g; next };
+    s/\bDCB\b[ \t]*\"/.ascii \"/                          && do { s/\$\$/\$/g; next };
+    s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/                    && do { s/\$\$/\$/g; next };
+    # If there's nothing on a line but a comment, don't try to apply any further
+    #  substitutions (this is a cheap hack to avoid mucking up the license header)
+    s/^([ \t]*);/$1@/                                     && do { s/\$\$/\$/g; next };
+    # If substituted -- leave immediately !
+
+    s/@/,:/;
+    s/;/@/;
+    while ( /@.*'/ ) {
+      s/(@.*)'/$1/g;
+    }
+    s/\{FALSE\}/0/g;
+    s/\{TRUE\}/1/g;
+    s/\{(\w\w\w\w+)\}/$1/g;
+    s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+    s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+    s/\bIMPORT\b/.extern/;
+    s/\bEXPORT\b/.global/;
+    s/^(\s+)\[/$1IF/;
+    s/^(\s+)\|/$1ELSE/;
+    s/^(\s+)\]/$1ENDIF/;
+    s/IF *:DEF:/ .ifdef/;
+    s/IF *:LNOT: *:DEF:/ .ifndef/;
+    s/ELSE/ .else/;
+    s/ENDIF/ .endif/;
+
+    if( /\bIF\b/ ) {
+      s/\bIF\b/ .if/;
+      s/=/==/;
+    }
+    if ( $n == 2) {
+        s/\$/\\/g;
+    }
+    if ($n == 1) {
+        s/\$//g;
+        s/label//g;
+    $n = 2;
+      }
+    if ( /MACRO/ ) {
+      s/MACRO *\n/.macro/;
+      $n=1;
+    }
+    if ( /\bMEND\b/ ) {
+      s/\bMEND\b/.endm/;
+      $n=0;
+    }
+
+    # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+    #
+    if ( /\bAREA\b/ ) {
+        s/^(.+)CODE(.+)READONLY(.*)/    .text/;
+        s/^(.+)DATA(.+)READONLY(.*)/    .section .rdata\n    .align 2/;
+        s/^(.+)\|\|\.data\|\|(.+)/    .data\n    .align 2/;
+        s/^(.+)\|\|\.bss\|\|(.+)/    .bss/;
+    }
+
+    s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/;       # ||.constdata$3||
+    s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/;               # ||.bss$2||
+    s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/;             # ||.data$2||
+    s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+    s/^(\s+)\%(\s)/    .space $1/;
+
+    s/\|(.+)\.(\d+)\|/\.$1_$2/;                     # |L80.123| -> .L80_123
+    s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+    s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+    if (/\bPROC\b/)
+    {
+        print "    .thumb_func" if ($thumb);
+        s/\bPROC\b/@ $&/;
+    }
+    s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+    s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+    s/\bENDP\b/@ $&/;
+    s/\bSUBT\b/@ $&/;
+    s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25
+    s/\bKEEP\b/@ $&/;
+    s/\bEXPORTAS\b/@ $&/;
+    s/\|\|(.)+\bEQU\b/@ $&/;
+    s/\|\|([\w\$]+)\|\|/$1/;
+    s/\bENTRY\b/@ $&/;
+    s/\bASSERT\b/@ $&/;
+    s/\bGBLL\b/@ $&/;
+    s/\bGBLA\b/@ $&/;
+    s/^\W+OPT\b/@ $&/;
+    s/:OR:/|/g;
+    s/:SHL:/<</g;
+    s/:SHR:/>>/g;
+    s/:AND:/&/g;
+    s/:LAND:/&&/g;
+    s/CPSR/cpsr/;
+    s/SPSR/spsr/;
+    s/ALIGN$/.balign 4/;
+    s/ALIGN\s+([0-9x]+)$/.balign $1/;
+    s/psr_cxsf/psr_all/;
+    s/LTORG/.ltorg/;
+    s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+    #  {PC} + 0xdeadfeed  -->  . + 0xdeadfeed
+    s/\{PC\} \+/ \. +/;
+
+    # Single hex constant on the line !
+    #
+    # >>> NOTE <<<
+    #   Double-precision floats in gcc are always mixed-endian, which means
+    #   bytes in two words are little-endian, but words are big-endian.
+    #   So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+    #   and 0xfeed0000 at high address.
+    #
+    s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+    s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+    # Single hex constant on the line !
+#    s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+#    s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+    s/\bDCFS[ \t]+0x/.word 0x/;
+    s/\bDCFS\b/.float/;
+
+    s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+    s/\bDCD\b/.word/;
+    s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+    s/\bDCW\b/.short/;
+    s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+    s/\bDCB\b/.byte/;
+    s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+    s/^[A-Za-z_\.]\w+/$&:/;
+    s/^(\d+)/$1:/;
+    s/\%(\d+)/$1b_or_f/;
+    s/\%[Bb](\d+)/$1b/;
+    s/\%[Ff](\d+)/$1f/;
+    s/\%[Ff][Tt](\d+)/$1f/;
+    s/&([\dA-Fa-f]+)/0x$1/;
+    if ( /\b2_[01]+\b/ ) {
+      s/\b2_([01]+)\b/conv$1&&&&/g;
+      while ( /[01][01][01][01]&&&&/ ) {
+        s/0000&&&&/&&&&0/g;
+        s/0001&&&&/&&&&1/g;
+        s/0010&&&&/&&&&2/g;
+        s/0011&&&&/&&&&3/g;
+        s/0100&&&&/&&&&4/g;
+        s/0101&&&&/&&&&5/g;
+        s/0110&&&&/&&&&6/g;
+        s/0111&&&&/&&&&7/g;
+        s/1000&&&&/&&&&8/g;
+        s/1001&&&&/&&&&9/g;
+        s/1010&&&&/&&&&A/g;
+        s/1011&&&&/&&&&B/g;
+        s/1100&&&&/&&&&C/g;
+        s/1101&&&&/&&&&D/g;
+        s/1110&&&&/&&&&E/g;
+        s/1111&&&&/&&&&F/g;
+      }
+      s/000&&&&/&&&&0/g;
+      s/001&&&&/&&&&1/g;
+      s/010&&&&/&&&&2/g;
+      s/011&&&&/&&&&3/g;
+      s/100&&&&/&&&&4/g;
+      s/101&&&&/&&&&5/g;
+      s/110&&&&/&&&&6/g;
+      s/111&&&&/&&&&7/g;
+      s/00&&&&/&&&&0/g;
+      s/01&&&&/&&&&1/g;
+      s/10&&&&/&&&&2/g;
+      s/11&&&&/&&&&3/g;
+      s/0&&&&/&&&&0/g;
+      s/1&&&&/&&&&1/g;
+      s/conv&&&&/0x/g;
+    }
+
+    if ( /commandline/)
+    {
+        if( /-bigend/)
+        {
+            $bigend=1;
+        }
+    }
+
+    if ( /\bDCDU\b/ )
+    {
+        my $cmd=$_;
+        my $value;
+        my $w1;
+        my $w2;
+        my $w3;
+        my $w4;
+
+        s/\s+DCDU\b/@ $&/;
+
+        $cmd =~ /\bDCDU\b\s+0x(\d+)/;
+        $value = $1;
+        $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
+        $w1 = $1;
+        $w2 = $2;
+        $w3 = $3;
+        $w4 = $4;
+
+        if( $bigend ne "")
+        {
+            # big endian
+
+            print "        .byte      0x".$w1;
+            print "        .byte      0x".$w2;
+            print "        .byte      0x".$w3;
+            print "        .byte      0x".$w4;
+        }
+        else
+        {
+            # little endian
+
+            print "        .byte      0x".$w4;
+            print "        .byte      0x".$w3;
+            print "        .byte      0x".$w2;
+            print "        .byte      0x".$w1;
+        }
+
+    }
+
+
+    if ( /\badrl\b/i )
+    {
+        s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+        $addPadding = 1;
+    }
+    s/\bEND\b/@ END/;
+} continue {
+    printf ("%s", $_) if $printit;
+    if ($addPadding != 0)
+    {
+        printf ("   mov r0,r0\n");
+        $addPadding = 0;
+    }
+}
+
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armbits.h
@@ -0,0 +1,32 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armbits_H)
+# define _arm_armbits_H (1)
+# include "../bitpack.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_pack_read oc_pack_read_arm
+#  define oc_pack_read1 oc_pack_read1_arm
+#  define oc_huff_token_decode oc_huff_token_decode_arm
+# endif
+
+long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_arm(oc_pack_buf *_b);
+int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
+
+#endif
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armbits.s
@@ -0,0 +1,230 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	EXPORT oc_pack_read_arm
+	EXPORT oc_pack_read1_arm
+	EXPORT oc_huff_token_decode_arm
+
+oc_pack_read1_arm PROC
+	; r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,#1          ; r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      ; r0 = window>>31
+	MOV r2,r2,LSL #1       ; r2 = window<<=1
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+	ENDP
+
+oc_pack_read_arm PROC
+	; r0 = oc_pack_buf *_b
+	; r1 = int          _bits
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,r1          ; r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+
+; We need to refill window.
+oc_pack_read1_refill
+	MOV r1,#1
+oc_pack_read_refill
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     ; r10 = stop
+	                       ; r11 = ptr
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	RSB r3,r3,r0           ; r3 = 32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMP r10,r11            ; ptr<stop => HI
+	CMPHI r3,#7            ;   available<=24 => HI
+	LDRHIB r14,[r11],#1    ;     r14 = *ptr++
+	SUBHI r3,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;     r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;     ptr<stop => HI
+	CMPHI r3,#7            ;       available<=24 => HI
+	LDRHIB r14,[r11],#1    ;         r14 = *ptr++
+	SUBHI r3,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;         r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;         ptr<stop => HI
+	CMPHI r3,#7            ;           available<=24 => HI
+	LDRHIB r14,[r11],#1    ;             r14 = *ptr++
+	SUBHI r3,#8            ;             available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;             r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;             ptr<stop => HI
+	CMPHI r3,#7            ;               available<=24 => HI
+	LDRHIB r14,[r11],#1    ;                 r14 = *ptr++
+	SUBHI r3,#8            ;                 available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          ; r3 = available-=_bits, available<bits => GT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+; Either we wanted to read more than 24 bits and didn't have enough room to
+;  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last
+	CMP r11,r10            ; ptr<stop => LO
+; If we didn't hit the end of the packet, then pull enough of the next byte to
+;  to fill up the window.
+	LDRLOB r14,[r11]       ; (LO) r14 = *ptr
+; Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           ; (HS) r14 = 1
+	ADDLO r10,r3,r1        ; (LO) r10 = available
+	STRHS r14,[r12,#8]     ; (HS) eof = 1
+	ANDLO r10,r10,#7       ; (LO) r10 = available&7
+	MOVHS r3,#1<<30        ; (HS) available = OC_LOTS_OF_BITS
+	ORRLO r2,r14,LSL r10   ; (LO) r2 = window|=*ptr>>(available&7)
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+	ENDP
+
+
+
+oc_huff_token_decode_arm PROC
+	; r0 = oc_pack_buf       *_b
+	; r1 = const ogg_int16_t *_tree
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         ; r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       ; r2 = stop
+	; Stall...             ; r3 = ptr
+	; Stall...             ; r4 = window
+	                       ; r5 = available
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  ; r14 = _tree+bits
+	LDRSH r12,[r14,#2]     ; r12 = node=_tree[1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+; The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue
+	ADD r12,r1,r12,LSL #1  ; r12 = _tree+node
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r5,r10          ; r5 = available-=n
+	LDRSH r10,[r12],#2     ; r10 = n=_tree[node]
+	; Stall...             ; r12 = _tree+node+1
+	; Stall...
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ;
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0
+	ADD r12,r1,#2          ; r12 = _tree+1
+oc_huff_token_decode_refill
+; We can't possibly need more than 15 bits, so available must be <= 15.
+; Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              ; ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;   r14 = *ptr++
+	RSBHI r5,r5,#24        ; (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        ; (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    ;   r4 = window|=r14<<32-available
+	CMPHI r2,r3            ;   ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;     r14 = *ptr++
+	SUBHI r5,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;     r4 = window|=r14<<32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMPHI r2,r3            ;     ptr<stop => HI
+	CMPHI r5,#7            ;       available<=24 => HI
+	LDRHIB r14,[r3],#1     ;         r14 = *ptr++
+	SUBHI r5,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;         r4 = window|=r14<<32-available
+	CMP r2,r3              ; ptr<stop => HI
+	MOVLS r5,#-1<<30       ; (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            ; (HI) available<=24 => HI
+	LDRHIB r14,[r3],#1     ; (HI)   r14 = *ptr++
+	SUBHI r5,#8            ; (HI)   available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ; (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ;
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          ; r5 = available
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+	ENDP
+
+	END
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armcpu.c
@@ -0,0 +1,116 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for ARM processors.
+
+ function:
+  last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#include "armcpu.h"
+
+#if !defined(OC_ARM_ASM)|| \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_ARMV6)&& \
+ !defined(OC_ARM_ASM_NEON)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+
+#elif defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  flags=0;
+  /*MSVC has no inline __asm support for ARM, but it does let you __emit
+     instructions via their assembled hex code.
+    All of these instructions should be essentially nops.*/
+# if defined(OC_ARM_ASM_EDSP)
+  __try{
+    /*PLD [r13]*/
+    __emit(0xF5DDF000);
+    flags|=OC_CPU_ARM_EDSP;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  __try{
+    /*SHADD8 r3,r3,r3*/
+    __emit(0xE6333F93);
+    flags|=OC_CPU_ARM_MEDIA;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  __try{
+    /*VORR q0,q0,q0*/
+    __emit(0xF2200150);
+    flags|=OC_CPU_ARM_NEON;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   endif
+#  endif
+# endif
+  return flags;
+}
+
+#elif defined(__linux__)
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t  flags;
+  FILE         *fin;
+  flags=0;
+  /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+     Android.
+    This also means that detection will fail in Scratchbox.*/
+  fin=fopen("/proc/cpuinfo","r");
+  if(fin!=NULL){
+    /*512 should be enough for anybody (it's even enough for all the flags that
+       x86 has accumulated... so far).*/
+    char buf[512];
+    while(fgets(buf,511,fin)!=NULL){
+      if(memcmp(buf,"Features",8)==0){
+        char *p;
+        p=strstr(buf," edsp");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP;
+        p=strstr(buf," neon");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON;
+      }
+      if(memcmp(buf,"CPU architecture:",17)==0){
+        int version;
+        version=atoi(buf+17);
+        if(version>=6)flags|=OC_CPU_ARM_MEDIA;
+      }
+    }
+    fclose(fin);
+  }
+  return flags;
+}
+
+#else
+/*The feature registers which can tell us what the processor supports are
+   accessible in priveleged modes only, so we can't have a general user-space
+   detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+ "your platform.  Reconfigure with --disable-asm (or send patches)."
+#endif
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armcpu.h
@@ -0,0 +1,29 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_arm_armcpu_H)
+# define _arm_armcpu_H (1)
+#include "../internal.h"
+
+/*"Parallel instructions" from ARM v6 and above.*/
+#define OC_CPU_ARM_MEDIA    (1<<24)
+/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
+#define OC_CPU_ARM_EDSP     (1<<7)
+#define OC_CPU_ARM_NEON     (1<<12)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armfrag.s
@@ -0,0 +1,656 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+; Vanilla ARM v4 versions
+	EXPORT	oc_frag_copy_list_arm
+	EXPORT	oc_frag_recon_intra_arm
+	EXPORT	oc_frag_recon_inter_arm
+	EXPORT	oc_frag_recon_inter2_arm
+
+oc_frag_copy_list_arm PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	LDR	r4,[r3],#4		; r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4
+ofcl_arm_lp
+	LDR	r11,[r14,r4,LSL #2]	; r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	; Stall (on XScale)
+	ADD	r4, r1, r11		; r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		; r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end
+	LDMFD	r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8
+	MOV	r5, #255
+	SUB	r1, r1, #7
+ofrintra_lp_arm
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+	ENDP
+
+oc_frag_recon_inter_arm PROC
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src
+	; r2 =       int            ystride
+	; r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8
+	MOV	r5, #255
+	SUB	r2, r2, #7
+ofrinter_lp_arm
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+	ENDP
+
+oc_frag_recon_inter2_arm PROC
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src1
+	; r2 = const unsigned char *src2
+	; r3 =       int            ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8
+	MOV	r8, #255
+	SUB	r3, r3, #7
+ofrinter2_lp_arm
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3
+	LDRB	r6, [r2], r3
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+	ENDP
+
+ [ OC_ARM_ASM_EDSP
+	EXPORT	oc_frag_copy_list_edsp
+
+oc_frag_copy_list_edsp PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		; r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	; r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	; Stall (on XScale)
+	LDRD	r6, [r4, r5]!		; r4 = _src_frame+frag_buf_off
+	LDRD	r8, [r4, r2]!
+	; Stall
+	STRD	r6, [r5, r0]!		; r5 = _dst_frame+frag_buf_off
+	STRD	r8, [r5, r2]!
+	; Stall
+	LDRD	r6, [r4, r2]!	; On Xscale at least, doing 3 consecutive
+	LDRD	r8, [r4, r2]!	; loads causes a stall, but that's no worse
+	LDRD	r10,[r4, r2]!	; than us only doing 2, and having to do
+				; another pair of LDRD/STRD later on.
+	; Stall
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRD	r6, [r4, r2]!
+	LDRD	r8, [r4, r2]!
+	LDRD	r10,[r4, r2]!
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_frag_recon_intra_v6
+	EXPORT	oc_frag_recon_inter_v6
+	EXPORT	oc_frag_recon_inter2_v6
+
+oc_frag_recon_intra_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8
+	MOV	r12,r2
+	LDR	r6, =0x00800080
+ofrintra_v6_lp
+	LDRD	r2, [r12],#8	; r2 = 11110000 r3 = 33332222
+	LDRD	r4, [r12],#8	; r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		; r2 = __11__00
+	USAT16	r3, #8, r3		; r3 = __33__22
+	USAT16	r4, #8, r4		; r4 = __55__44
+	USAT16	r5, #8, r5		; r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	; r2 = __111100
+	ORR	r3, r3, r3, LSR #8	; r3 = __333322
+	ORR	r4, r4, r4, LSR #8	; r4 = __555544
+	ORR	r5, r5, r5, LSR #8	; r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
+	STRD	r2, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+	ENDP
+
+oc_frag_recon_inter_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8
+ofrinter_v6_lp
+	LDRD	r6, [r3], #8		; r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1
+ [ OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, [r1], r2	; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+ ]
+	PKHBT	r12,r6, r7, LSL #16	; r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r6,r4			; r6 = __22__00
+	UXTB16	r4,r4, ROR #8		; r4 = __33__11
+	QADD16	r12,r12,r6		; r12= xx22xx00
+	QADD16	r4, r7, r4		; r4 = xx33xx11
+	LDRD	r6, [r3], #8		; r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		; r4 = __33__11
+	USAT16	r12,#8,r12		; r12= __22__00
+	ORR	r4, r12,r4, LSL #8	; r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	; r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 77775555
+	UXTB16	r6,r5			; r6 = __66__44
+	UXTB16	r5,r5, ROR #8		; r5 = __77__55
+	QADD16	r12,r12,r6		; r12= xx66xx44
+	QADD16	r5, r7, r5		; r5 = xx77xx55
+	USAT16	r12,#8, r12		; r12= __66__44
+	USAT16	r5, #8, r5		; r4 = __77__55
+	ORR	r5, r12,r5, LSL #8	; r5 = 33221100
+	STRD	r4, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+	ENDP
+
+oc_frag_recon_inter2_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8
+ofrinter2_v6_lp
+	LDRD	r6, [r12,#8]	; r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1
+	LDR	r4, [r1, #4]	; Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	; Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	; r9 = 77775555
+	UHADD8	r4, r4, r5	; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			; r5 = __66__44
+	UXTB16	r4, r4, ROR #8		; r4 = __77__55
+	QADD16	r8, r8, r5		; r8 = xx66xx44
+	QADD16	r9, r9, r4		; r9 = xx77xx55
+	LDRD	r6,[r12],#16	; r6 = 33332222 r7 = 11110000
+	USAT16	r8, #8, r8		; r8 = __66__44
+	LDR	r4, [r1], r3	; Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		; r9 = __77__55
+	LDR	r5, [r2], r3	; Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	; r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 22220000
+	UHADD8	r4, r4, r5	; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r5, r4			; r5 = __22__00
+	UXTB16	r4, r4, ROR #8		; r4 = __33__11
+	QADD16	r8, r8, r5		; r8 = xx22xx00
+	QADD16	r7, r7, r4		; r7 = xx33xx11
+	USAT16	r8, #8, r8		; r8 = __22__00
+	USAT16	r7, #8, r7		; r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	; r8 = 33221100
+	STRD	r8, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_frag_copy_list_neon
+	EXPORT	oc_frag_recon_intra_neon
+	EXPORT	oc_frag_recon_inter_neon
+	EXPORT	oc_frag_recon_inter2_neon
+
+oc_frag_copy_list_neon PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		; r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	; Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	; Stall (on XScale)
+	MOV	r7, r6			; Guarantee PLD points somewhere valid.
+ofcl_neon_lp
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4@64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4@64], r2
+	SUBS	r12, r12, #1
+	VLD1.64	{D2}, [r4@64], r2
+	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4@64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4@64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4@64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4@64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4@64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5@64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5@64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5@64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5@64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end
+	LDMFD	r13!,{r4-r7,PC}
+	ENDP
+
+oc_frag_recon_intra_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	MOV	r3, #128
+	VDUP.S16	Q0, r3
+	VLDMIA	r2,  {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	; D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	; D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	; D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0@64], r1
+	VQMOVUN.S16	D19,Q11	; D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0@64], r1
+	VQMOVUN.S16	D20,Q12	; D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0@64], r1
+	VQMOVUN.S16	D21,Q13	; D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0@64], r1
+	VQMOVUN.S16	D22,Q14	; D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0@64], r1
+	VQMOVUN.S16	D23,Q15	; D23= !!@@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0@64], r1
+	VST1.64	{D22},[r0@64], r1
+	VST1.64	{D23},[r0@64], r1
+	MOV	PC,R14
+	ENDP
+
+oc_frag_recon_inter_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	; etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r2
+	VST1.64	{D22},[r0@64], r2
+	VST1.64	{D23},[r0@64], r2
+	MOV	PC,R14
+	ENDP
+
+oc_frag_recon_inter2_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	; Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		; etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r3
+	VST1.64	{D22},[r0@64], r3
+	VST1.64	{D23},[r0@64], r3
+	MOV	PC,R14
+	ENDP
+ ]
+
+	END
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armidct.s
@@ -0,0 +1,1908 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_idct8x8_1_arm
+	EXPORT	oc_idct8x8_arm
+
+oc_idct8x8_1_arm PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	ORR	r1, r1, r1, LSL #16
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_arm PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_arm
+	CMP	r2, #6
+	BLE	oc_idct8x8_6_arm
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_arm
+oc_idct8x8_slow_arm
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	; Write to temp storage.
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #8*16
+	CMP	r0, r2
+	MOV	r1, r13		; And read from temp storage.
+	BEQ	oc_idct8x8_slow_arm_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	MOV	r6, #0
+	MOV	r7, #0
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+oc_idct8x8_slow_arm_cols
+; Column transforms
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_10_arm PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct4core_arm
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #4*16
+	CMP	r0, r2
+	MOV	r1, r13		; Read from temp storage.
+	BEQ	oc_idct8x8_10_arm_cols
+	MOV	r4, #0
+	STR	r4, [r0]
+	STR	r4, [r0,#4]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#20]
+	STR	r4, [r0,#32]
+	STR	r4, [r0,#48]
+	MOV	r0, r2		; Write to the final destination
+oc_idct8x8_10_arm_cols
+; Column transforms
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_6_arm PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #3*16
+	CMP	r0, r2
+	MOV	r1, r13		; Read from temp storage.
+	BEQ	oc_idct8x8_6_arm_cols
+	MOV	r4, #0
+	STR	r4, [r0]
+	STR	r4, [r0,#4]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#32]
+	MOV	r0, r2		; Write to the final destination
+oc_idct8x8_6_arm_cols
+; Column transforms
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	ENDP
+
+oc_idct8x8_3_arm PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #2*16
+	CMP	r0, r2
+	MOV	r1, r13		; Read from temp storage.
+	MOVNE	r4, #0
+	STRNE	r4, [r0]
+	STRNE	r4, [r0,#16]
+	MOVNE	r0, r2		; Write to the final destination
+; Column transforms
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	ENDP
+
+idct1core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #14]
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #46]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #78]
+	STRH	r3, [r0, #94]
+	STRH	r3, [r0, #110]
+	MOV	PC,R14
+	ENDP
+
+idct2core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r3, r3, ASR #16		; r3 = t[4]
+	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		; r10= t[5]
+	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]
+	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+	ADD	r3, r3, r9		; r3 = t[0]+t[4]
+	ADD	r11,r11,r9		; r11= t[0]+t[7]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r12,[r0, #14]		; y[1] = t[0]+t[6]
+	STRH	r10,[r0, #30]		; y[2] = t[0]+t[5]
+	STRH	r3, [r0, #46]		; y[3] = t[0]+t[4]
+	RSB	r3, r3, r9, LSL #1	; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+	RSB	r10,r10,r9, LSL #1	; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+	RSB	r12,r12,r9, LSL #1	; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+	RSB	r11,r11,r9, LSL #1	; r1 = t[0]*2-(t[0]+t[7])=t[0]-t[7]
+	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct2core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MOV	r3, r3, ASR #16		; r3 = t[4]
+	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		; r10= t[5]
+	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]+8
+	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+	ADD	r3, r3, r9		; r3 = t[0]+t[4]+8
+	ADD	r11,r11,r9		; r11= t[0]+t[7]+8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r4, r11,ASR #4
+	MOV	r5, r12,ASR #4
+	MOV	r6, r10,ASR #4
+	MOV	r7, r3, ASR #4
+	RSB	r3, r3, r9, LSL #1	;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+	RSB	r10,r10,r9, LSL #1	;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+	RSB	r12,r12,r9, LSL #1	;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+	RSB	r11,r11,r9, LSL #1	;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+	MOV	r3, r3, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r11,r11,ASR #4
+	STRH	r4, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[0]+t[6]
+	STRH	r6, [r0, #30]		; y[2] = t[0]+t[5]
+	STRH	r7, [r0, #46]		; y[3] = t[0]+t[4]
+	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct3core_arm PROC
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4		; r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		; r3 = x[2]
+	LDR	r10,OC_C6S2		; r10= OC_C6S2
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		; r4 = OC_C2S6
+	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		; r4 = OC_C7S1
+	LDR	r5, OC_C1S7		; r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]
+	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r4, r4, ASR #16		; r4 = t[4]
+	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	; r10= t[1] = t[0]+t[2]
+	RSB	r6, r10,r9, LSL #1	; r6 = t[2] = t[0]-t[2]
+					; r3 = t2[0] = t[0]+t[3]
+	RSB	r9, r3, r9, LSL #1	; r9 = t2[3] = t[0]-t[3]
+	MOV	r12,r12,ASR #16		; r12= t[6]
+	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		; r11= t2[0]+t[7]
+	ADD	r5, r10,r5		; r5 = t[1]+t2[6]
+	ADD	r12,r6, r12		; r12= t[2]+t2[5]
+	ADD	r4, r9, r4		; r4 = t2[3]+t[4]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
+	RSB	r11,r11,r3, LSL #1	; r11= t2[0] - t[7]
+	RSB	r5, r5, r10,LSL #1	; r5 = t[1]  - t2[6]
+	RSB	r12,r12,r6, LSL #1	; r6 = t[2]  - t2[5]
+	RSB	r4, r4, r9, LSL #1	; r4 = t2[3] - t[4]
+	STRH	r4, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r12,[r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r5, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	ENDP
+
+idct3core_down_arm PROC
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4		; r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		; r3 = x[2]
+	LDR	r10,OC_C6S2		; r10= OC_C6S2
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		; r4 = OC_C2S6
+	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		; r4 = OC_C7S1
+	LDR	r5, OC_C1S7		; r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]+8
+	MOV	r4, r4, ASR #16		; r4 = t[4]
+	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	; r10= t[1]+8 = t[0]+t[2]+8
+	RSB	r6, r10,r9, LSL #1	; r6 = t[2]+8 = t[0]-t[2]+8
+					; r3 = t2[0]+8 = t[0]+t[3]+8
+	RSB	r9, r3, r9, LSL #1	; r9 = t2[3]+8 = t[0]-t[3]+8
+	MOV	r12,r12,ASR #16		; r12= t[6]
+	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		; r11= t2[0]+t[7] +8
+	ADD	r5, r10,r5		; r5 = t[1] +t2[6]+8
+	ADD	r12,r6, r12		; r12= t[2] +t2[5]+8
+	ADD	r4, r9, r4		; r4 = t2[3]+t[4] +8
+	RSB	r3, r11,r3, LSL #1	; r11= t2[0] - t[7]  + 8
+	RSB	r10,r5, r10,LSL #1	; r5 = t[1]  - t2[6] + 8
+	RSB	r6, r12,r6, LSL #1	; r6 = t[2]  - t2[5] + 8
+	RSB	r9, r4, r9, LSL #1	; r4 = t2[3] - t[4]  + 8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r5, r5, ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r3, r3, ASR #4
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r6, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r3, [r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	ENDP
+
+idct4core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r10,OC_C4S4		; r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		; r12= x[2]
+	LDR	r4, OC_C6S2		; r4 = OC_C6S2
+	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		; r5 = OC_C2S6
+	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		; r3 = x[1]
+	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		; r6 = OC_C7S1
+	LDR	r12,OC_C1S7		; r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		; r11= x[3]
+	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		; r7 = OC_C5S3
+	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		; r8 = OC_C3S5
+	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
+	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		; r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
+	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2]
+	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2]
+	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3]
+	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3]
+	MOV	r3, r3, ASR #16		; r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
+	ADD	r11,r5, r11		; r11= t[0]+t2[7]
+	ADD	r6, r4, r6		; r6 = t[1]+t3[6]
+	ADD	r3, r10,r3		; r3 = t[2]+t3[5]
+	ADD	r7, r9, r7		; r7 = t[3]+t2[4]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r7, [r0, #46]		; y[3] = t2[3]+t[4]
+	RSB	r11,r11,r5, LSL #1	; r11= t[0]-t2[7]
+	RSB	r6, r6, r4, LSL #1	; r6 = t[1]-t3[6]
+	RSB	r3, r3, r10,LSL #1	; r3 = t[2]-t3[5]
+	RSB	r7, r7, r9, LSL #1	; r7 = t[3]-t2[4]
+	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11, [r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct4core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r10,OC_C4S4		; r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		; r12= x[2]
+	LDR	r4, OC_C6S2		; r4 = OC_C6S2
+	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		; r5 = OC_C2S6
+	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		; r3 = x[1]
+	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		; r6 = OC_C7S1
+	LDR	r12,OC_C1S7		; r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		; r11= x[3]
+	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		; r7 = OC_C5S3
+	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		; r8 = OC_C3S5
+	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
+	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		; r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2] + 8
+	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2] + 8
+	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3] + 8
+	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3] + 8
+	MOV	r3, r3, ASR #16		; r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
+	ADD	r5, r5, r11		; r5 = t[0]+t2[7]+8
+	ADD	r4, r4, r6		; r4 = t[1]+t3[6]+8
+	ADD	r10,r10,r3		; r10= t[2]+t3[5]+8
+	ADD	r9, r9, r7		; r9 = t[3]+t2[4]+8
+	SUB	r11,r5, r11,LSL #1	; r11= t[0]-t2[7]+8
+	SUB	r6, r4, r6, LSL #1	; r6 = t[1]-t3[6]+8
+	SUB	r3, r10,r3, LSL #1	; r3 = t[2]-t3[5]+8
+	SUB	r7, r9, r7, LSL #1	; r7 = t[3]-t2[4]+8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r7, r7, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r5, r5, ASR #4
+	STRH	r5,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r4, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r10,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r9, [r0, #46]		; y[3] = t2[3]+t[4]
+	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct8core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		; r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		; r6 = x[4]
+	LDR	r12,OC_C4S4		; r12= C4S4
+	LDRSH	r4, [r1, #-12]		; r4 = x[2]
+	ADD	r2, r2, r6		; r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
+	; For spec compliance, these sums must be truncated to 16-bit precision
+	; _before_ the multiply (not after).
+	; Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		; r8 = x[6]
+	LDR	r7, OC_C6S2		; r7 = OC_C6S2
+	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		; r14= OC_C2S6
+	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		; r5 = OC_C7S1
+	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		; r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		; r14= x[1]
+	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		; r8 = x[7]
+	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		; r10= x[5]
+	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		; r1 = x[3]
+	LDR	r5, OC_C3S5		; r5 = OC_C3S5
+	LDR	r11,OC_C5S3		; r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	; r10=t[6] r12=C4S4 r14=t[5]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	; Stage 2
+	; 4-5 butterfly
+	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	; 7-6 butterfly
+	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]+t[6])
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	; Stage 3
+	; 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3]
+	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3]
+	; 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2]
+	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2]
+	; 6-5 butterfly
+	MOV	r14,r14,ASR #16		; r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
+	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	; r10=t3[6] r14=t3[5]
+	; Stage 4
+	ADD	r2, r2, r8		; r2 = t[0] + t[7]
+	ADD	r6, r6, r10		; r6 = t[1] + t[6]
+	ADD	r3, r3, r14		; r3 = t[2] + t[5]
+	ADD	r4, r4, r9		; r4 = t[3] + t[4]
+	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7]
+	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6]
+	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5]
+	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4]
+	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	ENDP
+
+idct8core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		; r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		; r6 = x[4]
+	LDR	r12,OC_C4S4		; r12= C4S4
+	LDRSH	r4, [r1, #-12]		; r4 = x[2]
+	ADD	r2, r2, r6		; r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
+	; For spec compliance, these sums must be truncated to 16-bit precision
+	; _before_ the multiply (not after).
+	; Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		; r8 = x[6]
+	LDR	r7, OC_C6S2		; r7 = OC_C6S2
+	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		; r14= OC_C2S6
+	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		; r5 = OC_C7S1
+	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		; r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		; r14= x[1]
+	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		; r8 = x[7]
+	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		; r10= x[5]
+	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		; r1 = x[3]
+	LDR	r5, OC_C3S5		; r5 = OC_C3S5
+	LDR	r11,OC_C5S3		; r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	; r10=t[6] r12=C4S4 r14=t[5]
+	; Stage 2
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	; 4-5 butterfly
+	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	; 7-6 butterfly
+	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]+t[6])
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	; Stage 3
+	ADD	r2, r2, #8<<16		; r2 = t[0]+8<<16
+	ADD	r6, r6, #8<<16		; r6 = t[1]+8<<16
+	; 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3] + 8
+	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3] + 8
+	; 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2] + 8
+	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2] + 8
+	; 6-5 butterfly
+	MOV	r14,r14,ASR #16		; r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
+	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	; r10=t3[6] r14=t3[5]
+	; Stage 4
+	ADD	r2, r2, r8		; r2 = t[0] + t[7] + 8
+	ADD	r6, r6, r10		; r6 = t[1] + t[6] + 8
+	ADD	r3, r3, r14		; r3 = t[2] + t[5] + 8
+	ADD	r4, r4, r9		; r4 = t[3] + t[4] + 8
+	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7] + 8
+	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6] + 8
+	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5] + 8
+	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4] + 8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r2, r2, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r8, r8, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r14,r14,ASR #4
+	MOV	r9, r9, ASR #4
+	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	ENDP
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_idct8x8_1_v6
+	EXPORT	oc_idct8x8_v6
+
+oc_idct8x8_1_v6 PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	ORR	r2, r1, r1, LSL #16
+	ORR	r3, r1, r1, LSL #16
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_v6 PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_v6
+	;CMP	r2, #6
+	;BLE	oc_idct8x8_6_v6
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_v6
+oc_idct8x8_slow_v6
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	; Write to temp storage.
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #8*16
+	CMP	r0, r2
+	MOV	r1, r13		; And read from temp storage.
+	BEQ	oc_idct8x8_slow_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+oc_idct8x8_slow_v6_cols
+; Column transforms
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_10_v6 PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4	; Align the stack.
+	ADD	r0, r0, r2	; Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #4*16
+	CMP	r0, r2
+	AND	r1, r13,#4	; Align the stack.
+	BEQ	oc_idct8x8_10_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r2]
+	STRD	r4, [r2,#16]
+	STR	r4, [r2,#32]
+	STR	r4, [r2,#48]
+oc_idct8x8_10_v6_cols
+; Column transforms
+	ADD	r1, r1, r13	; And read from temp storage.
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_3_v6 PROC
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r8, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct2_1core_v6
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #2*16
+	CMP	r0, r8
+	MOV	r1, r13		; Read from temp storage.
+	MOVNE	r4, #0
+	STRNE	r4, [r0]
+	STRNE	r4, [r0,#16]
+	MOVNE	r0, r8		; Write to the final destination.
+; Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+	ENDP
+
+idct2_1core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		; r6 = x[1,0]
+	SMULWB	r12,r3, r2		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+; Stage 2:
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		; r7 = <0|t[0,7]>
+; Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	; r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		; r4 = <0|t[0,4]>
+	SASX	r5, r5, r5		; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+; Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	; r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		; r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		; r3 = t[0]+t[7]
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		; r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		; r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		; r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+ ]
+
+	ALIGN 8
+OC_C7S1
+	DCD	12785 ; 31F1
+OC_C1S7
+	DCD	64277 ; FB15
+OC_C6S2
+	DCD	25080 ; 61F8
+OC_C2S6
+	DCD	60547 ; EC83
+OC_C5S3
+	DCD	36410 ; 8E3A
+OC_C3S5
+	DCD	54491 ; D4DB
+OC_C4S4
+	DCD	46341 ; B505
+
+ [ OC_ARM_ASM_MEDIA
+idct2_2core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			; r7  = 8
+	LDR	r6, [r1], #16		; r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT  r5, r5, r2		; r2 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	; r7 = <t[0,7]|t[0,7]>
+; Stage 2:
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	; r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	; r2 = <t[1,5]|t[0,5]>
+; Stage 3:
+	SSUB16	r5, r6, r2		; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+; Stage 4:
+	SADD16	r2, r12,r7		; r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		; r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		; r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		; r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	; r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		; y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	; r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		; y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	; r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		; y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	ENDP
+
+; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+;  pay for increased branch mis-prediction to get here, but in practice it
+;  doesn't seem to slow anything down to take it out, and it's less code this
+;  way.
+ [ 0
+oc_idct8x8_6_v6 PROC
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4	; Align the stack.
+	ADD	r0, r0, r13	; Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #3*16
+	CMP	r0, r8
+	AND	r1, r13,#4	; Align the stack.
+	BEQ	oc_idct8x8_6_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r0]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#32]
+	MOV	r0, r8		; Write to the final destination.
+oc_idct8x8_6_v6_cols
+; Column transforms
+	ADD	r1, r1, r13	; And read from temp storage.
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+	ENDP
+
+idct1core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	; Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+	ENDP
+
+idct3_2core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r4, [r1], #16		; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,OC_C6S2_3_v6	; r10= OC_C6S2; r11= OC_C2S6
+	; Stall
+	SMULWB	r3, r11,r5		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		; r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT  r10,r7, r5		; r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		; r2 = <0|t[0,2]>
+	SMULWT  r7, r7, r4		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		; r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		; r10= t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_3core_stage3_v6
+	ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_3_v6
+	DCD	12785 ; 31F1
+OC_C1S7_3_v6
+	DCD	64277 ; FB15
+OC_C6S2_3_v6
+	DCD	25080 ; 61F8
+OC_C2S6_3_v6
+	DCD	60547 ; EC83
+
+idct3_3core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, OC_C6S2_3_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	; r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	; r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_4core_down_stage3_v6
+	ENDP
+ ]
+
+idct4_3core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		; r9 = <0|t[0,6]>
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		; r9 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_3core_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'=t[0]+t[3]
+	SSUB16	r3, r12,r3		; r3 = t[3]=t[0]-t[3]
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]
+	STR	r12,[r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]
+	STR	r12,[r0, #12]		; y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]
+	STR	r12,[r0, #28]		; y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]
+	STR	r12,[r0, #44]		; y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_4_v6
+	DCD	12785 ; 31F1
+OC_C1S7_4_v6
+	DCD	64277 ; FB15
+OC_C6S2_4_v6
+	DCD	25080 ; 61F8
+OC_C2S6_4_v6
+	DCD	60547 ; EC83
+OC_C5S3_4_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_4_v6
+	DCD	54491 ; D4DB
+
+idct4_4core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16	; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	; r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	; r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_4core_down_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+	ENDP
+
+idct8_8core_v6 PROC
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_4_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_4_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r2|r9>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,6]|t[0,6]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r9|r12>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r12|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,6]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,6]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		; r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		; r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		; r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		; r8 = t[1,1]=OC_C4S4*r4T>>16
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,0]|t[0,0]>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+	ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_8_v6
+	DCD	12785 ; 31F1
+OC_C1S7_8_v6
+	DCD	64277 ; FB15
+OC_C6S2_8_v6
+	DCD	25080 ; 61F8
+OC_C2S6_8_v6
+	DCD	60547 ; EC83
+OC_C5S3_8_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_8_v6
+	DCD	54491 ; D4DB
+
+idct8_8core_down_v6 PROC
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r2|r9>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,6]|t[0,6]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r9|r12>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r12|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,6]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,6]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,0]+8|t[0,0]+8>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_idct8x8_1_neon
+	EXPORT	oc_idct8x8_neon
+
+	ALIGN 16
+OC_IDCT_CONSTS_NEON
+	DCW	    8
+	DCW	64277 ; FB15 (C1S7)
+	DCW	60547 ; EC83 (C2S6)
+	DCW	54491 ; D4DB (C3S5)
+	DCW	46341 ; B505 (C4S4)
+	DCW	36410 ; 471D (C5S3)
+	DCW	25080 ; 30FC (C6S2)
+	DCW	12785 ; 31F1 (C7S1)
+
+oc_idct8x8_1_neon PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	VDUP.S16	Q0, r1
+	VMOV		Q1, Q0
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_neon PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon
+	VPUSH		{D8-D15}
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	; Row transforms (input is pre-transposed)
+	VLD1.64		{D16,D17,D18,D19}, [r2@128]!
+	VLD1.64		{D20,D21,D22,D23}, [r2@128]!
+	VLD1.64		{D24,D25,D26,D27}, [r2@128]!
+	VSUB.S16	Q1, Q8, Q12	; Q8 = x[0]-x[4]
+	VLD1.64		{D28,D29,D30,D31}, [r2@128]
+	VADD.S16	Q8, Q8, Q12	; Q1 = x[0]+x[4]
+	VLD1.64		{D0,D1},           [r3@128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VTRN.16		Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	; 8x8 Transpose
+	VTRN.16		Q8, Q9
+	VTRN.16		Q10,Q11
+	VTRN.16		Q12,Q13
+	VTRN.32		Q8, Q10
+	VTRN.32		Q9, Q11
+	VTRN.32		Q12,Q14
+	VTRN.32		Q13,Q15
+	VSWP		D17,D24
+	VSUB.S16	Q1, Q8, Q12	; Q8 = x[0]-x[4]
+	VSWP		D19,D26
+	VADD.S16	Q8, Q8, Q12	; Q1 = x[0]+x[4]
+	VSWP		D21,D28
+	VSWP		D23,D30
+	; Column transforms
+	BL	oc_idct8x8_stage123_neon
+	CMP	r0,r1
+	; We have to put the return address back in the LR, or the branch
+	;  predictor will not recognize the function return and mis-predict the
+	;  entire call stack.
+	MOV	r14, r12
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	BEQ		oc_idct8x8_slow_neon_noclear
+	VMOV.I8		Q2,#0
+	VPOP		{D8-D15}
+	VMOV.I8		Q3,#0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+
+oc_idct8x8_slow_neon_noclear
+	VPOP		{D8-D15}
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_stage123_neon PROC
+; Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
+; Stage 3
+	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
+	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_10_neon PROC
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64		{D0,D1},          [r3@128]
+	MOV	r2, r1
+	; Row transforms (input is pre-transposed)
+; Stage 1
+	VLD1.64		{D16,D17,D18,D19},[r2@128]!
+	MOV	r12, #16
+	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64		{D17},            [r2@64], r12
+	VMULL.S16	Q2, D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64		{D19},            [r2@64]
+	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3, D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D17,D1[2]	; Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
+	VSHRN.S32	D4, Q2, #16	; D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
+	VSHRN.S32	D6, Q3, #16	; D6 = t[6]-x[3]
+	VSHRN.S32	D7, Q13,#16	; D7 = -t[5]-x[3]
+	VSHRN.S32	D5, Q12,#16	; D5 = t[4]
+	VSHRN.S32	D2, Q1, #16	; D2 = t[2]
+	VADD.S16	D4, D4, D18	; D4 = t[7]
+	VADD.S16	D6, D6, D19	; D6 = t[6]
+	VADD.S16	D7, D7, D19	; D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	; D30= t[0]
+					; D31= t[3]
+; Stages 2 & 3
+	VSUB.S16	Q12,Q2, Q3	; D24= t[7]-t[6]
+					; D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2, Q3	; D26= t[7]'=t[7]+t[6]
+					; D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
+					;       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
+					;       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
+	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
+	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
+	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
+; Stage 4
+	VSUB.S16	Q11,Q8, Q13	; D22= y[7]=t[0]'-t[7]'
+					; D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9, Q12	; D20= y[6]=t[1]'-t[6]'
+					; D21= y[4]=t[3]'-t[4]''
+	VADD.S16	Q8, Q8, Q13	; D16= y[0]=t[0]'+t[7]'
+					; D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9, Q9, Q12	; D18= y[1]=t[1]'-t[6]'
+					; D19= y[3]=t[3]'-t[4]''
+	; 8x4 transpose
+	VTRN.16		Q10,Q11		; Q10= c5c4a5a4 c7c6a7a6
+					; Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16		Q8, Q9		; Q8 = c3c2a3a2 c1c0a1a0
+					; Q9 = d3d2b3b2 d1d0b1b0
+	VSWP		D20,D21		; Q10= c7c6a7a6 c5c4a5a4
+	VSWP		D22,D23		; Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32		Q9, Q11		; Q9 = b7b6b5b4 b3b2b1b0
+					; Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32		Q8, Q10		; Q8 = a7a6a5a4 a3a2a1a0
+					; Q10= c7c6c5c4 c3c2c1c0
+	; Column transforms
+; Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3, D22,D0[3]
+	VMULL.S16	Q2, D23,D0[3]	;  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3, #16
+	VSHRN.S32	D29,Q2, #16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2, D19,D1[3]	;  Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3, D20,D0[2]
+	VMULL.S16	Q9, D21,D0[2]	;  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2, #16	; Q12= t[4]
+	VMULL.S16	Q2, D20,D1[2]
+	VSHRN.S32	D2, Q1, #16
+	VSHRN.S32	D3, Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	;  Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q9, #16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q15,Q14	; Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q11,#16	; Q2 = t[2]
+	VADD.S16	Q1, Q1, Q8	; Q1 = t[0]
+	VADD.S16	Q8, Q12,Q13	; Q8 = t[4]-t[5]
+	VADD.S16	Q3, Q3, Q10	; Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
+					;           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[6]-t[7])
+					;           -(t[6]-t[7]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VADD.S16	Q11,Q1, Q3	; Q11= t[0]'=t[0]+t[3]
+	VSUB.S16	Q3, Q1, Q3	; Q3 = t[3]'=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
+	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
+	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
+	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
+	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
+	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
+; Stage 4
+	CMP	r0, r1
+	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
+	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
+	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q10,Q2, Q13	; Q10 = y[2]=t[2]'+t[5]''
+	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
+	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
+	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
+	BEQ	oc_idct8x8_10_neon_noclear
+	VMOV.I8		D2, #0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64		{D2}, [r1@64]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+
+oc_idct8x8_10_neon_noclear
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	ENDP
+ ]
+
+	END
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armint.h
@@ -0,0 +1,126 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armint_H)
+# define _arm_armint_H (1)
+# include "../internal.h"
+
+# if defined(OC_ARM_ASM)
+
+#  if defined(__ARMEB__)
+#   error "Big-endian configurations are not supported by the ARM asm. " \
+ "Reconfigure with --disable-asm or undefine OC_ARM_ASM."
+#  endif
+
+#  define oc_state_accel_init oc_state_accel_init_arm
+/*This function is implemented entirely in asm, so it's helpful to pull out all
+   of the things that depend on structure offsets.
+  We reuse the function pointer with the wrong prototype, though.*/
+#  define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
+ _fragy0,_fragy_end) \
+  ((oc_loop_filter_frag_rows_arm_func) \
+   (_state)->opt_vtable.state_loop_filter_frag_rows)( \
+   (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
+   (_bv), \
+   (_state)->frags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset, \
+   (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
+   (_state)->frag_buf_offs, \
+   (_state)->fplanes[(_pli)].nhfrags)
+/*For everything else the default vtable macros are fine.*/
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+
+# include "../state.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+typedef void (*oc_loop_filter_frag_rows_arm_func)(
+ unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
+ const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
+ ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+void oc_state_accel_init_arm(oc_theora_state *_state);
+void oc_frag_copy_list_arm(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#  if defined(OC_ARM_ASM_EDSP)
+void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+
+#   if defined(OC_ARM_ASM_MEDIA)
+void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#    if defined(OC_ARM_ASM_NEON)
+void oc_frag_copy_list_neon(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armloop.s
@@ -0,0 +1,676 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_loop_filter_frag_rows_arm
+
+; Which bit this is depends on the order of packing within a bitfield.
+; Hopefully that doesn't change among any of the relevant compilers.
+OC_FRAG_CODED_FLAG	*	1
+
+	; Vanilla ARM v4 version
+loop_filter_h_arm PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfh_arm_lp
+	LDRB	r3, [r0, #-2]		; r3 = _pix[0]
+	LDRB	r12,[r0, #1]		; r12= _pix[3]
+	LDRB	r4, [r0, #-1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3
+	LDMFD	r13!,{r3-r6,PC}
+	ENDP
+
+loop_filter_v_arm PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfv_arm_lp
+	LDRB	r3, [r0, -r1, LSL #1]	; r3 = _pix[0]
+	LDRB	r12,[r0, r1]		; r12= _pix[3]
+	LDRB	r4, [r0, -r1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8
+	LDMFD	r13!,{r3-r6,PC}
+	ENDP
+
+oc_loop_filter_frag_rows_arm PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	; _bv += 127
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_arm_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_loop_filter_init_v6
+	EXPORT	oc_loop_filter_frag_rows_v6
+
+oc_loop_filter_init_v6 PROC
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		; r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	; r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+	ENDP
+
+; We could use the same strategy as the v filter below, but that would require
+;  40 instructions to load the data and transpose it into columns and another
+;  32 to write out the results at the end, plus the 52 instructions to do the
+;  filtering itself.
+; This is slightly less, and less code, even assuming we could have shared the
+;  52 instructions in the middle with the other function.
+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
+;  proposed for FFmpeg, but not by much:
+;  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+; His is a lot less code, though, because it only does two rows at once instead
+;  of four.
+loop_filter_h_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+loop_filter_h_core_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; r12= 0x10003
+	; Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		; r4 = <p3|p2|p1|p0>
+	; Single issue
+	LDR	r5,[r0, r1]!		; r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		; r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		; r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		; r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		; r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	; r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	; r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		; r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		; r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		; r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		; r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	; r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		; r6 = <r0|r2>
+	UXTB16	r11,r11			; r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		; r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		; r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	; r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		; r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		; r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		; r7 = <r3-r0|r1-r2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	; r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	; r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	; r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	; r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			; r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	; r6 = <-R_s|-R_q|-R_r|-R_p>
+	; Single issue
+	; There's no min, max or abs instruction.
+	; SSUB8 and SEL will work for abs, and we can do all the rest with
+	;  unsigned saturated adds, which means the GE flags are still all
+	;  set when we're done computing lflim(abs(R_i),L).
+	; This allows us to both add and subtract, and split the results by
+	;  the original sign of R_i.
+	SSUB8	r7, r10,r6
+	; Single issue
+	SEL	r7, r7, r6		; r7 = abs(R_i)
+	; Single issue
+	UQADD8	r4, r7, r2		; r4 = 255-max(2*L-abs(R_i),0)
+	; Single issue
+	UQADD8	r7, r7, r4
+	; Single issue
+	UQSUB8	r7, r7, r4		; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	; Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		; r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		; r9 = p2-lflim(R_i,L)
+	MOV	r5, r9, LSR #24		; r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		; r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		; r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		; r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		; r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		; r4 = q1
+	STRB	r4, [r0,#-1]
+	; Single issue
+	STRB	r9, [r0,-r1]!
+	; Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+	ENDP
+
+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+;  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+; This works just as well, with the following procedure for computing the
+;  filter value, f:
+;   u = ~UHADD8(p1,~p2);
+;   v = UHADD8(~p1,p2);
+;   m = v-u;
+;   a = m^UHADD8(m^p0,m^~p3);
+;   f = UHADD8(UHADD8(a,u1),v1);
+;  where f = 127+R, with R in [-127,128] defined as in the spec.
+; This is exactly the same amount of arithmetic as the version that uses PAVGB
+;  as the basic operator.
+; It executes about 2/3 the number of instructions of David Conrad's approach,
+;  but requires more code, because it does all eight columns at once, instead
+;  of four at a time.
+loop_filter_v_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		; r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		; r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		; r9, r8 = <p6|p2>
+	MVN	r14,r6			; r14= ~p1
+	LDRD	r10,[r0, r1]		; r11,r10= <p7|p3>
+	; Filter the first four columns.
+	MVN	r12,r8			; r12= ~p2
+	UHADD8	r14,r14,r8		; r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		; r12= p1+~p2>>1
+	MVN	r10, r10		; r10=~p3
+	MVN	r12,r12			; r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		; r14= m1=v1-u1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = m1^p0
+	EOR	r10,r10,r14		; r10= m1^~p3
+	UHADD8	r4, r4, r10		; r4 = (m1^p0)+(m1^~p3)>>1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		; r14= v1=m1+u1
+	UHADD8	r4, r4, r12		; r4 = a1+u1>>1
+	MVN	r12,r9			; r12= ~p6
+	UHADD8	r4, r4, r14		; r4 = f1=(a1+u1>>1)+v1>>1
+	; Filter the second four columns.
+	MVN	r14,r7			; r14= ~p5
+	UHADD8	r12,r12,r7		; r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		; r14= v2=~p5+p6>>1
+	MVN	r12,r12			; r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			; r11=~p7
+	SSUB8	r10,r14,r12		; r10= m2=v2-u2
+	; Single issue
+	EOR	r5, r5, r10		; r5 = m2^p4
+	EOR	r11,r11,r10		; r11= m2^~p7
+	UHADD8	r5, r5, r11		; r5 = (m2^p4)+(m2^~p7)>>1
+	; Single issue
+	EOR	r5, r5, r10		; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	; Single issue
+	UHADD8	r5, r5, r12		; r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		; r12 = {127}x4
+	UHADD8	r5, r5, r14		; r5 = f2=(a2+u2>>1)+v2>>1
+	; Now split f[i] by sign.
+	; There's no min or max instruction.
+	; We could use SSUB8 and SEL, but this is just as many instructions and
+	;  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		; r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		; r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		; r11= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r4, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r4, r4, r14		; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r11,r5, r12		; r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		; r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		; r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		; r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		; r10= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r5, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r5, r5, r14		; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		; r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		; [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_loop_filter_frag_rows_v6 PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	; ll = *(int *)_bv
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_v6_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		; if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_loop_filter_init_neon
+	EXPORT	oc_loop_filter_frag_rows_neon
+
+oc_loop_filter_init_neon PROC
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  ; r1 = 2*L
+	VDUP.S16	Q15, r1		; Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0@128]
+	MOV	PC,r14
+	ENDP
+
+loop_filter_h_neon PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	; Doing a 2-element structure load saves doing two VTRN's below, at the
+	;  cost of using two more slower single-lane loads vs. the faster
+	;  all-lane loads.
+	; It's less code this way, though, and benches a hair faster, but it
+	;  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		; D0 = ____________1100     2,1
+						; D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		; D4 = ____________5544     2,1
+						; D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		; D0 = ________99881100     3,1
+						; D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		; D4 = ________DDCC5544     3,1
+						; D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		; D0 = ____GGHH99881100     3,1
+						; D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		; D4 = ____KKLLDDCC5544     3,1
+						; D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		; D0 = PPOOGGHH99881100     3,1
+						; D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		; D4 = TTSSKKLLDDCC5544     3,1
+						; D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	; Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   ; Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		; D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		; D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		; D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+	ENDP
+
+loop_filter_v_neon PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12@64], r1		; D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12@64], r1		; D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12@64], r1		; D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12@64]			; D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	; Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   ; Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		; D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		; D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12@64], r1
+	VST1.64	{D4}, [r12@64], r1
+	MOV	PC,r14
+	ENDP
+
+oc_loop_filter_frag_rows_neon PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	;		  bail
+	VLD1.64	{D30,D31}, [r2@128]	; Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_neon_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+ ]
+
+	END
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armopts.s
@@ -0,0 +1,39 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+;********************************************************************
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP		*	1
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA	*	1
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON		*	1
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+OC_ARM_CAN_UNALIGN	*	0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+;  boundary, so it's usually a bad idea to use them anyway if they can be
+;  avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD	*	0
+
+	END
new file mode 100644
--- /dev/null
+++ b/media/libtheora/lib/arm/armstate.c
@@ -0,0 +1,219 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+# endif
+
+void oc_state_accel_init_arm(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+  _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
+  /*Note: We _must_ set this function pointer, because the macro in armint.h
+     calls it with different arguments, so the C version will segfault.*/
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+    _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(_state->cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+    _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+#    endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+  }
+#   endif
+#  endif
+# endif
+}
+
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_arm(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_arm(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_v6(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_v6(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_neon(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_neon(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+#  endif
+# endif
+
+#endif
--- a/media/libtheora/lib/bitpack.c
+++ b/media/libtheora/lib/bitpack.c
@@ -6,17 +6,17 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009             *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function: packing variable sized words into an octet stream
-  last mod: $Id: bitpack.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id: bitpack.c 17410 2010-09-21 21:53:48Z tterribe $
 
  ********************************************************************/
 #include <string.h>
 #include <stdlib.h>
 #include "bitpack.h"
 
 /*We're 'MSb' endian; if we write a word but read individual bits,
    then we'll read the MSb first.*/
@@ -27,25 +27,28 @@ void oc_pack_readinit(oc_pack_buf *_b,un
   _b->stop=_buf+_bytes;
 }
 
 static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
   const unsigned char *ptr;
   const unsigned char *stop;
   oc_pb_window         window;
   int                  available;
+  unsigned             shift;
+  stop=_b->stop;
+  ptr=_b->ptr;
   window=_b->window;
   available=_b->bits;
-  ptr=_b->ptr;
-  stop=_b->stop;
-  while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){
-    available+=8;
-    window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+  shift=OC_PB_WINDOW_SIZE-available;
+  while(7<shift&&ptr<stop){
+    shift-=8;
+    window|=(oc_pb_window)*ptr++<<shift;
   }
   _b->ptr=ptr;
+  available=OC_PB_WINDOW_SIZE-shift;
   if(_bits>available){
     if(ptr>=stop){
       _b->eof=1;
       available=OC_LOTS_OF_BITS;
     }
     else window|=*ptr>>(available&7);
   }
   _b->bits=available;
@@ -62,50 +65,50 @@ int oc_pack_look1(oc_pack_buf *_b){
 }
 
 void oc_pack_adv1(oc_pack_buf *_b){
   _b->window<<=1;
   _b->bits--;
 }
 
 /*Here we assume that 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits){
+long oc_pack_read_c(oc_pack_buf *_b,int _bits){
   oc_pb_window window;
   int          available;
   long         result;
   window=_b->window;
   available=_b->bits;
   if(_bits==0)return 0;
   if(available<_bits){
     window=oc_pack_refill(_b,_bits);
     available=_b->bits;
   }
   result=window>>OC_PB_WINDOW_SIZE-_bits;
   available-=_bits;
   window<<=1;
   window<<=_bits-1;
+  _b->window=window;
   _b->bits=available;
-  _b->window=window;
   return result;
 }
 
-int oc_pack_read1(oc_pack_buf *_b){
+int oc_pack_read1_c(oc_pack_buf *_b){
   oc_pb_window window;
   int          available;
   int          result;
   window=_b->window;
   available=_b->bits;
   if(available<1){
     window=oc_pack_refill(_b,1);
     available=_b->bits;
   }
   result=window>>OC_PB_WINDOW_SIZE-1;
   available--;
   window<<=1;
+  _b->window=window;
   _b->bits=available;
-  _b->window=window;
   return result;
 }
 
 long oc_pack_bytes_left(oc_pack_buf *_b){
   if(_b->eof)return -1;
   return _b->stop-_b->ptr+(_b->bits>>3);
 }
--- a/media/libtheora/lib/bitpack.h
+++ b/media/libtheora/lib/bitpack.h
@@ -11,47 +11,64 @@
  ********************************************************************
 
   function: packing variable sized words into an octet stream
   last mod: $Id: bitwise.c 7675 2004-09-01 00:34:39Z xiphmont $
 
  ********************************************************************/
 #if !defined(_bitpack_H)
 # define _bitpack_H (1)
+# include <stddef.h>
 # include <limits.h>
+# include "internal.h"
+
+
+
+typedef size_t             oc_pb_window;
+typedef struct oc_pack_buf oc_pack_buf;
 
 
 
-typedef unsigned long      oc_pb_window;
-typedef struct oc_pack_buf oc_pack_buf;
+/*Custom bitpacker implementations.*/
+# if defined(OC_ARM_ASM)
+#  include "arm/armbits.h"
+# endif
 
-
+# if !defined(oc_pack_read)
+#  define oc_pack_read oc_pack_read_c
+# endif
+# if !defined(oc_pack_read1)
+#  define oc_pack_read1 oc_pack_read1_c
+# endif
+# if !defined(oc_huff_token_decode)
+#  define oc_huff_token_decode oc_huff_token_decode_c
+# endif
 
 # define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
 /*This is meant to be a large, positive constant that can still be efficiently
    loaded as an immediate (on platforms like ARM, for example).
   Even relatively modest values like 100 would work fine.*/
 # define OC_LOTS_OF_BITS (0x40000000)
 
 
 
 struct oc_pack_buf{
-  oc_pb_window         window;
+  const unsigned char *stop;
   const unsigned char *ptr;
-  const unsigned char *stop;
+  oc_pb_window         window;
   int                  bits;
   int                  eof;
 };
 
 void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
 int oc_pack_look1(oc_pack_buf *_b);
 void oc_pack_adv1(oc_pack_buf *_b);
 /*Here we assume 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits);
-int oc_pack_read1(oc_pack_buf *_b);
+long oc_pack_read_c(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_c(oc_pack_buf *_b);
 /* returns -1 for read beyond EOF, or the number of whole bytes available */
 long oc_pack_bytes_left(oc_pack_buf *_b);
 
 /*These two functions are implemented locally in huffdec.c*/
 /*Read in bits without advancing the bitptr.
   Here we assume 0<=_bits&&_bits<=32.*/
 /*static int oc_pack_look(oc_pack_buf *_b,int _bits);*/
 /*static void oc_pack_adv(oc_pack_buf *_b,int _bits);*/
--- a/media/libtheora/lib/config.h
+++ b/media/libtheora/lib/config.h
@@ -27,64 +27,72 @@
 
 /* Define to 1 if you have the <strings.h> header file. */
 #define HAVE_STRINGS_H 1
 
 /* Define to 1 if you have the <string.h> header file. */
 #define HAVE_STRING_H 1
 
 /* Define to 1 if you have the <sys/soundcard.h> header file. */
-/* #undef HAVE_SYS_SOUNDCARD_H */
+#define HAVE_SYS_SOUNDCARD_H 1
 
 /* Define to 1 if you have the <sys/stat.h> header file. */
 #define HAVE_SYS_STAT_H 1
 
 /* Define to 1 if you have the <sys/types.h> header file. */
 #define HAVE_SYS_TYPES_H 1
 
 /* Define to 1 if you have the <unistd.h> header file. */
 #define HAVE_UNISTD_H 1
 
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
-   */
-#define LT_OBJDIR ".libs/"
-
 /* Define to 1 if your C compiler doesn't accept -c and -o together. */
 /* #undef NO_MINUS_C_MINUS_O */
 
+/* make use of arm asm optimization */
+ 
+
+/* Define if assembler supports EDSP instructions */
+
+
+/* Define if assembler supports ARMv6 media instructions */
+
+
+/* Define if compiler supports NEON instructions */
+
+
+/* make use of c64x+ asm optimization */
+/* #undef OC_C64X_ASM */
+
 /* make use of x86_64 asm optimization */
 /* #undef OC_X86_64_ASM */
 
 /* make use of x86 asm optimization */
- /**/
+/* #undef OC_X86_ASM */
 
 /* Name of package */
 #define PACKAGE "libtheora"
 
 /* Define to the address where bug reports for this package should be sent. */
 #define PACKAGE_BUGREPORT ""
 
 /* Define to the full name of this package. */
 #define PACKAGE_NAME "libtheora"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libtheora 1.1.1+svn"
+#define PACKAGE_STRING "libtheora 1.2.0alpha1+svn"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "libtheora"
 
-/* Define to the home page for this package. */
-#define PACKAGE_URL ""
-
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.1.1+svn"
+#define PACKAGE_VERSION "1.2.0alpha1+svn"
 
 /* Define to 1 if you have the ANSI C header files. */
 #define STDC_HEADERS 1
 
 /* Define to exclude encode support from the build */
 /* #undef THEORA_DISABLE_ENCODE */
 
 /* Define to exclude floating point code from the build */
 /* #undef THEORA_DISABLE_FLOAT */
 
 /* Version number of package */
-#define VERSION "1.1.1+svn"
+#define VERSION "1.2.0alpha1+svn"
deleted file mode 100644
--- a/media/libtheora/lib/cpu.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- *                                                                  *
- ********************************************************************
-
- CPU capability detection for x86 processors.
-  Originally written by Rudolf Marek.
-
- function:
-  last mod: $Id: cpu.c 16503 2009-08-22 18:14:02Z giles $
-
- ********************************************************************/
-
-#include "cpu.h"
-
-#if !defined(OC_X86_ASM)
-static ogg_uint32_t oc_cpu_flags_get(void){
-  return 0;
-}
-#else
-# if !defined(_MSC_VER)
-#  if defined(__amd64__)||defined(__x86_64__)
-/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
-   compiling with -fPIC.*/
-#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
-  __asm__ __volatile__( \
-   "cpuid\n\t" \
-   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
-   :"a"(_op) \
-   :"cc" \
-  )
-#  else
-/*On x86-32, not so much.*/
-#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
-  __asm__ __volatile__( \
-   "xchgl %%ebx,%[ebx]\n\t" \
-   "cpuid\n\t" \
-   "xchgl %%ebx,%[ebx]\n\t" \
-   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
-   :"a"(_op) \
-   :"cc" \
-  )
-#  endif
-# else
-/*Why does MSVC need this complicated rigamarole?
-  At this point I honestly do not care.*/
-
-/*Visual C cpuid helper function.
-  For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
-   for VS2003 users, so we do it in inline assembler.*/
-static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
-  _asm{
-    mov eax,[_op]
-    mov esi,_cpu_info
-    cpuid
-    mov [esi+0],eax
-    mov [esi+4],ebx
-    mov [esi+8],ecx
-    mov [esi+12],edx
-  }
-}
-
-#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
-  do{ \
-    ogg_uint32_t cpu_info[4]; \
-    oc_cpuid_helper(cpu_info,_op); \
-    (_eax)=cpu_info[0]; \
-    (_ebx)=cpu_info[1]; \
-    (_ecx)=cpu_info[2]; \
-    (_edx)=cpu_info[3]; \
-  }while(0)
-
-static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
-  _asm{
-    pushfd
-    pushfd
-    pop eax
-    mov ebx,eax
-    xor eax,200000h
-    push eax
-    popfd
-    pushfd
-    pop eax
-    popfd
-    mov ecx,_eax
-    mov [ecx],eax
-    mov ecx,_ebx
-    mov [ecx],ebx
-  }
-}
-# endif
-
-static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
-  ogg_uint32_t flags;
-  /*If there isn't even MMX, give up.*/
-  if(!(_edx&0x00800000))return 0;
-  flags=OC_CPU_X86_MMX;
-  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
-  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
-  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
-  if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
-  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
-  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
-  return flags;
-}
-
-static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
-  ogg_uint32_t flags;
-  /*If there isn't even MMX, give up.*/
-  if(!(_edx&0x00800000))return 0;
-  flags=OC_CPU_X86_MMX;
-  if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
-  if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
-  if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
-  if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
-  if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
-  return flags;
-}
-
-static ogg_uint32_t oc_cpu_flags_get(void){
-  ogg_uint32_t flags;
-  ogg_uint32_t eax;
-  ogg_uint32_t ebx;
-  ogg_uint32_t ecx;
-  ogg_uint32_t edx;
-# if !defined(__amd64__)&&!defined(__x86_64__)
-  /*Not all x86-32 chips support cpuid, so we have to check.*/
-#  if !defined(_MSC_VER)
-  __asm__ __volatile__(
-   "pushfl\n\t"
-   "pushfl\n\t"
-   "popl %[a]\n\t"
-   "movl %[a],%[b]\n\t"
-   "xorl $0x200000,%[a]\n\t"
-   "pushl %[a]\n\t"
-   "popfl\n\t"
-   "pushfl\n\t"
-   "popl %[a]\n\t"
-   "popfl\n\t"
-   :[a]"=r"(eax),[b]"=r"(ebx)
-   :
-   :"cc"
-  );
-#  else
-  oc_detect_cpuid_helper(&eax,&ebx);
-#  endif
-  /*No cpuid.*/
-  if(eax==ebx)return 0;
-# endif
-  cpuid(0,eax,ebx,ecx,edx);
-  /*         l e t n          I e n i          u n e G*/
-  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
-   /*      6 8 x M          T e n i          u n e G*/
-   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
-    /*Intel, Transmeta (tested with Crusoe TM5800):*/
-    cpuid(1,eax,ebx,ecx,edx);
-    flags=oc_parse_intel_flags(edx,ecx);
-  }
-  /*              D M A c          i t n e          h t u A*/
-  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
-   /*      C S N            y b   e          d o e G*/
-   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
-    /*AMD, Geode:*/
-    cpuid(0x80000000,eax,ebx,ecx,edx);
-    if(eax<0x80000001)flags=0;
-    else{
-      cpuid(0x80000001,eax,ebx,ecx,edx);
-      flags=oc_parse_amd_flags(edx,ecx);
-    }
-    /*Also check for SSE.*/
-    cpuid(1,eax,ebx,ecx,edx);
-    flags|=oc_parse_intel_flags(edx,ecx);
-  }
-  /*Technically some VIA chips can be configured in the BIOS to return any
-     string here the user wants.
-    There is a special detection method that can be used to identify such
-     processors, but in my opinion, if the user really wants to change it, they
-     deserve what they get.*/
-  /*              s l u a          H r u a          t n e C*/
-  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
-    /*VIA:*/
-    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
-       chips (thanks to the engineers from Centaur Technology who provided it).
-      These chips support Intel-like cpuid info.
-      The C3-2 (Nehemiah) cores appear to, as well.*/
-    cpuid(1,eax,ebx,ecx,edx);
-    flags=oc_parse_intel_flags(edx,ecx);
-    if(eax>=0x80000001){
-      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
-        We need to check this even if the Intel test succeeds to pick up 3DNow!
-         support on these processors.
-        Unlike actual AMD processors, we cannot _rely_ on this info, since
-         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
-         this function, yet return edx=0, despite the Intel test indicating
-         MMX support.
-        Therefore the features detected here are strictly added to those
-         detected by the Intel test.*/
-      /*TODO: How about earlier chips?*/
-      cpuid(0x80000001,eax,ebx,ecx,edx);
-      /*Note: As of the C7, this function returns Intel-style extended feature
-         flags, not AMD-style.
-        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
-         do not conflict with any of the AMD flags we inspect.
-        For the remaining bits, Intel tells us, "Do not count on their value",
-         but VIA assures us that they will all be zero (at least on the C7 and
-         Isaiah chips).
-        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
-         (0xC0C00000) for something else, we will have to add code to detect
-         the model to decide when it is appropriate to inspect them.*/
-      flags|=oc_parse_amd_flags(edx,ecx);
-    }
-  }
-  else{
-    /*Implement me.*/
-    flags=0;
-  }
-  return flags;
-}
-#endif
deleted file mode 100644
--- a/media/libtheora/lib/cpu.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- *                                                                  *
- ********************************************************************
- function:
-    last mod: $Id: cpu.h 16503 2009-08-22 18:14:02Z giles $
-
- ********************************************************************/
-
-#if !defined(_x86_cpu_H)
-# define _x86_cpu_H (1)
-#include "internal.h"
-
-#define OC_CPU_X86_MMX      (1<<0)
-#define OC_CPU_X86_3DNOW    (1<<1)
-#define OC_CPU_X86_3DNOWEXT (1<<2)
-#define OC_CPU_X86_MMXEXT   (1<<3)
-#define OC_CPU_X86_SSE      (1<<4)
-#define OC_CPU_X86_SSE2     (1<<5)
-#define OC_CPU_X86_PNI      (1<<6)
-#define OC_CPU_X86_SSSE3    (1<<7)
-#define OC_CPU_X86_SSE4_1   (1<<8)
-#define OC_CPU_X86_SSE4_2   (1<<9)
-#define OC_CPU_X86_SSE4A    (1<<10)
-#define OC_CPU_X86_SSE5     (1<<11)
-
-#endif
--- a/media/libtheora/lib/decinfo.c
+++ b/media/libtheora/lib/decinfo.c
@@ -6,17 +6,17 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: decinfo.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: decinfo.c 17276 2010-06-05 05:57:05Z tterribe $
 
  ********************************************************************/
 
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include "decint.h"
 
@@ -123,16 +123,20 @@ static int oc_comment_unpack(oc_pack_buf
   if(len<0||len>(LONG_MAX>>2)||len<<2>oc_pack_bytes_left(_opb)){
     _tc->comments=0;
     return TH_EBADHEADER;
   }
   _tc->comment_lengths=(int *)_ogg_malloc(
    _tc->comments*sizeof(_tc->comment_lengths[0]));
   _tc->user_comments=(char **)_ogg_malloc(
    _tc->comments*sizeof(_tc->user_comments[0]));
+  if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
+    _tc->comments=0;
+    return TH_EFAULT;
+  }
   for(i=0;i<_tc->comments;i++){
     len=oc_unpack_length(_opb);
     if(len<0||len>oc_pack_bytes_left(_opb)){
       _tc->comments=i;
       return TH_EBADHEADER;
     }
     _tc->comment_lengths[i]=len;
     _tc->user_comments[i]=_ogg_malloc((size_t)len+1);
--- a/media/libtheora/lib/decint.h
+++ b/media/libtheora/lib/decint.h
@@ -6,102 +6,181 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: decint.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: decint.h 17457 2010-09-24 02:05:49Z tterribe $
 
  ********************************************************************/
 
 #include <limits.h>
 #if !defined(_decint_H)
 # define _decint_H (1)
 # include "theora/theoradec.h"
-# include "internal.h"
+# include "state.h"
 # include "bitpack.h"
-
-typedef struct th_setup_info oc_setup_info;
-typedef struct th_dec_ctx    oc_dec_ctx;
-
 # include "huffdec.h"
 # include "dequant.h"
 
+typedef struct th_setup_info         oc_setup_info;
+typedef struct oc_dec_opt_vtable     oc_dec_opt_vtable;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct th_dec_ctx            oc_dec_ctx;
+
+
+
+/*Decoder-specific accelerated functions.*/
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xdec.h"
+# endif
+
+# if !defined(oc_dec_accel_init)
+#  define oc_dec_accel_init oc_dec_accel_init_c
+# endif
+# if defined(OC_DEC_USE_VTABLE)
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
+ ((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
+#  endif
+# else
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
+#  endif
+# endif
+
+
+
 /*Constants for the packet-in state machine specific to the decoder.*/
 
 /*Next packet to read: Data packet.*/
 #define OC_PACKET_DATA (0)
 
 
 
 struct th_setup_info{
   /*The Huffman codes.*/
-  oc_huff_node      *huff_tables[TH_NHUFFMAN_TABLES];
+  ogg_int16_t   *huff_tables[TH_NHUFFMAN_TABLES];
   /*The quantization parameters.*/
   th_quant_info  qinfo;
 };
 
 
 
+/*Decoder specific functions with accelerated variants.*/
+struct oc_dec_opt_vtable{
+  void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
+   oc_dec_pipeline_state *_pipe,int _pli);
+};
+
+
+
+struct oc_dec_pipeline_state{
+  /*Decoded DCT coefficients.
+    These are placed here instead of on the stack so that they can persist
+     between blocks, which makes clearing them back to zero much faster when
+     only a few non-zero coefficients were decoded.
+    It requires at least 65 elements because the zig-zag index array uses the
+     65th element as a dumping ground for out-of-range indices to protect us
+     from buffer overflow.
+    We make it fully twice as large so that the second half can serve as the
+     reconstruction buffer, which saves passing another parameter to all the
+     acceleration functios.
+    It also solves problems with 16-byte alignment for NEON on ARM.
+    gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
+     alignment, and silently produces incorrect results if you ask for 16.
+    Finally, keeping it off the stack means there's less likely to be a data
+     hazard beween the NEON co-processor and the regular ARM core, which avoids
+     unnecessary stalls.*/
+  OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  ptrdiff_t           ti[3][64];
+  ptrdiff_t           ebi[3][64];
+  ptrdiff_t           eob_runs[3][64];
+  const ptrdiff_t    *coded_fragis[3];
+  const ptrdiff_t    *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  const ogg_uint16_t *dequant[3][3][2];
+  int                 fragy0[3];
+  int                 fragy_end[3];
+  int                 pred_last[3][4];
+  int                 mcu_nvfrags;
+  int                 loop_filter;
+  int                 pp_level;
+};
+
+
 struct th_dec_ctx{
   /*Shared encoder/decoder state.*/
-  oc_theora_state      state;
+  oc_theora_state        state;
   /*Whether or not packets are ready to be emitted.
     This takes on negative values while there are remaining header packets to
      be emitted, reaches 0 when the codec is ready for input, and goes to 1
      when a frame has been processed and a data packet is ready.*/
-  int                  packet_state;
+  int                    packet_state;
   /*Buffer in which to assemble packets.*/
-  oc_pack_buf          opb;
+  oc_pack_buf            opb;
   /*Huffman decode trees.*/
-  oc_huff_node        *huff_tables[TH_NHUFFMAN_TABLES];
+  ogg_int16_t           *huff_tables[TH_NHUFFMAN_TABLES];
   /*The index of the first token in each plane for each coefficient.*/
-  ptrdiff_t            ti0[3][64];
+  ptrdiff_t              ti0[3][64];
   /*The number of outstanding EOB runs at the start of each coefficient in each
      plane.*/
-  ptrdiff_t            eob_runs[3][64];
+  ptrdiff_t              eob_runs[3][64];
   /*The DCT token lists.*/
-  unsigned char       *dct_tokens;
+  unsigned char         *dct_tokens;
   /*The extra bits associated with DCT tokens.*/
-  unsigned char       *extra_bits;
+  unsigned char         *extra_bits;
   /*The number of dct tokens unpacked so far.*/
-  int                  dct_tokens_count;
+  int                    dct_tokens_count;
   /*The out-of-loop post-processing level.*/
-  int                  pp_level;
+  int                    pp_level;
   /*The DC scale used for out-of-loop deblocking.*/
-  int                  pp_dc_scale[64];
+  int                    pp_dc_scale[64];
   /*The sharpen modifier used for out-of-loop deringing.*/
-  int                  pp_sharp_mod[64];
+  int                    pp_sharp_mod[64];
   /*The DC quantization index of each block.*/
-  unsigned char       *dc_qis;
+  unsigned char         *dc_qis;
   /*The variance of each block.*/
-  int                 *variances;
+  int                   *variances;
   /*The storage for the post-processed frame buffer.*/
-  unsigned char       *pp_frame_data;
+  unsigned char         *pp_frame_data;
   /*Whether or not the post-processsed frame buffer has space for chroma.*/
-  int                  pp_frame_state;
+  int                    pp_frame_state;
   /*The buffer used for the post-processed frame.
     Note that this is _not_ guaranteed to have the same strides and offsets as
      the reference frame buffers.*/
-  th_ycbcr_buffer      pp_frame_buf;
+  th_ycbcr_buffer        pp_frame_buf;
   /*The striped decode callback function.*/
-  th_stripe_callback   stripe_cb;
+  th_stripe_callback     stripe_cb;
+  oc_dec_pipeline_state  pipe;
+# if defined(OC_DEC_USE_VTABLE)
+  /*Table for decoder acceleration functions.*/
+  oc_dec_opt_vtable      opt_vtable;
+# endif
 # if defined(HAVE_CAIRO)
   /*Output metrics for debugging.*/
-  int                  telemetry;
-  int                  telemetry_mbmode;
-  int                  telemetry_mv;
-  int                  telemetry_qi;
-  int                  telemetry_bits;
-  int                  telemetry_frame_bytes;
-  int                  telemetry_coding_bytes;
-  int                  telemetry_mode_bytes;
-  int                  telemetry_mv_bytes;
-  int                  telemetry_qi_bytes;
-  int                  telemetry_dc_bytes;
-  unsigned char       *telemetry_frame_data;
+  int                    telemetry;
+  int                    telemetry_mbmode;
+  int                    telemetry_mv;
+  int                    telemetry_qi;
+  int                    telemetry_bits;
+  int                    telemetry_frame_bytes;
+  int                    telemetry_coding_bytes;
+  int                    telemetry_mode_bytes;
+  int                    telemetry_mv_bytes;
+  int                    telemetry_qi_bytes;
+  int                    telemetry_dc_bytes;
+  unsigned char         *telemetry_frame_data;
 # endif
 };
 
+/*Default pure-C implementations of decoder-specific accelerated functions.*/
+void oc_dec_accel_init_c(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
 #endif
--- a/media/libtheora/lib/decode.c
+++ b/media/libtheora/lib/decode.c
@@ -6,17 +6,17 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: decode.c 16581 2009-09-25 22:56:16Z gmaxwell $
+    last mod: $Id: decode.c 17576 2010-10-29 01:07:51Z tterribe $
 
  ********************************************************************/
 
 #include <stdlib.h>
 #include <string.h>
 #include <ogg/ogg.h>
 #include "decint.h"
 #if defined(OC_DUMP_IMAGES)
@@ -113,28 +113,28 @@ static const unsigned char OC_MODE_ALPHA
 /*The number of additional extra bits that are decoded with each of the
    internal DCT tokens.*/
 static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={
   12,4,3,3,4,4,5,5,8,8,8,8,3,3,6
 };
 
 /*Whether or not an internal token needs any additional extra bits.*/
 #define OC_DCT_TOKEN_NEEDS_MORE(token) \
- (token<(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \
+ (token<(int)(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \
   sizeof(*OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)))
 
 /*This token (OC_DCT_REPEAT_RUN3_TOKEN) requires more than 8 extra bits.*/
 #define OC_DCT_TOKEN_FAT_EOB (0)
 
 /*The number of EOBs to use for an end-of-frame token.
   Note: We want to set eobs to PTRDIFF_MAX here, but that requires C99, which
    is not yet available everywhere; this should be equivalent.*/
 #define OC_DCT_EOB_FINISH (~(size_t)0>>1)
 
-/*The location of the (6) run legth bits in the code word.
+/*The location of the (6) run length bits in the code word.
   These are placed at index 0 and given 8 bits (even though 6 would suffice)
    because it may be faster to extract the lower byte on some platforms.*/
 #define OC_DCT_CW_RLEN_SHIFT (0)
 /*The location of the (12) EOB bits in the code word.*/
 #define OC_DCT_CW_EOB_SHIFT  (8)
 /*The location of the (1) flip bit in the code word.
   This must be right under the magnitude bits.*/
 #define OC_DCT_CW_FLIP_BIT   (20)
@@ -292,91 +292,94 @@ static const ogg_int32_t OC_DCT_CODE_WOR
   OC_DCT_CW_PACK( 5, 0,  0,0),
   OC_DCT_CW_PACK( 6, 0,  0,0),
   OC_DCT_CW_PACK( 7, 0,  0,0),
 };
 
 
 
 static int oc_sb_run_unpack(oc_pack_buf *_opb){
-  long bits;
-  int ret;
   /*Coding scheme:
        Codeword            Run Length
      0                       1
      10x                     2-3
      110x                    4-5
      1110xx                  6-9
      11110xxx                10-17
      111110xxxx              18-33
      111111xxxxxxxxxxxx      34-4129*/
-  bits=oc_pack_read1(_opb);
-  if(bits==0)return 1;
-  bits=oc_pack_read(_opb,2);
-  if((bits&2)==0)return 2+(int)bits;
-  else if((bits&1)==0){
-    bits=oc_pack_read1(_opb);
-    return 4+(int)bits;
+  static const ogg_int16_t OC_SB_RUN_TREE[22]={
+    4,
+     -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1),
+     -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1),
+     -(3<<8|2),-(3<<8|2),-(3<<8|3),-(3<<8|3),
+     -(4<<8|4),-(4<<8|5),-(4<<8|2<<4|6-6),17,
+      2,
+       -(2<<8|2<<4|10-6),-(2<<8|2<<4|14-6),-(2<<8|4<<4|18-6),-(2<<8|12<<4|34-6)
+  };
+  int ret;
+  ret=oc_huff_token_decode(_opb,OC_SB_RUN_TREE);
+  if(ret>=0x10){
+    int offs;
+    offs=ret&0x1F;
+    ret=6+offs+(int)oc_pack_read(_opb,ret-offs>>4);
   }
-  bits=oc_pack_read(_opb,3);
-  if((bits&4)==0)return 6+(int)bits;
-  else if((bits&2)==0){
-    ret=10+((bits&1)<<2);
-    bits=oc_pack_read(_opb,2);
-    return ret+(int)bits;
-  }
-  else if((bits&1)==0){
-    bits=oc_pack_read(_opb,4);
-    return 18+(int)bits;
-  }
-  bits=oc_pack_read(_opb,12);
-  return 34+(int)bits;
+  return ret;
 }
 
 static int oc_block_run_unpack(oc_pack_buf *_opb){
-  long bits;
-  long bits2;
   /*Coding scheme:
      Codeword             Run Length
      0x                      1-2
      10x                     3-4
      110x                    5-6
      1110xx                  7-10
      11110xx                 11-14
      11111xxxx               15-30*/
-  bits=oc_pack_read(_opb,2);
-  if((bits&2)==0)return 1+(int)bits;
-  else if((bits&1)==0){
-    bits=oc_pack_read1(_opb);
-    return 3+(int)bits;
-  }
-  bits=oc_pack_read(_opb,2);
-  if((bits&2)==0)return 5+(int)bits;
-  else if((bits&1)==0){
-    bits=oc_pack_read(_opb,2);
-    return 7+(int)bits;
-  }
-  bits=oc_pack_read(_opb,3);
-  if((bits&4)==0)return 11+bits;
-  bits2=oc_pack_read(_opb,2);
-  return 15+((bits&3)<<2)+bits2;
+  static const ogg_int16_t OC_BLOCK_RUN_TREE[61]={
+    5,
+     -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+     -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+     -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2),
+     -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2),
+     -(3<<8|3),-(3<<8|3),-(3<<8|3),-(3<<8|3),
+     -(3<<8|4),-(3<<8|4),-(3<<8|4),-(3<<8|4),
+     -(4<<8|5),-(4<<8|5),-(4<<8|6),-(4<<8|6),
+     33,       36,       39,       44,
+      1,-(1<<8|7),-(1<<8|8),
+      1,-(1<<8|9),-(1<<8|10),
+      2,-(2<<8|11),-(2<<8|12),-(2<<8|13),-(2<<8|14),
+      4,
+       -(4<<8|15),-(4<<8|16),-(4<<8|17),-(4<<8|18),
+       -(4<<8|19),-(4<<8|20),-(4<<8|21),-(4<<8|22),
+       -(4<<8|23),-(4<<8|24),-(4<<8|25),-(4<<8|26),
+       -(4<<8|27),-(4<<8|28),-(4<<8|29),-(4<<8|30)
+  };
+  return oc_huff_token_decode(_opb,OC_BLOCK_RUN_TREE);
 }
 
 
 
+void oc_dec_accel_init_c(oc_dec_ctx *_dec){
+# if defined(OC_DEC_USE_VTABLE)
+  _dec->opt_vtable.dc_unpredict_mcu_plane=
+   oc_dec_dc_unpredict_mcu_plane_c;
+# endif
+}
+
 static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
  const th_setup_info *_setup){
   int qti;
   int pli;
   int qi;
   int ret;
   ret=oc_state_init(&_dec->state,_info,3);
   if(ret<0)return ret;
   ret=oc_huff_trees_copy(_dec->huff_tables,
-   (const oc_huff_node *const *)_setup->huff_tables);
+   (const ogg_int16_t *const *)_setup->huff_tables);
   if(ret<0){
     oc_state_clear(&_dec->state);
     return ret;
   }
   /*For each fragment, allocate one byte for every DCT coefficient token, plus
      one byte for extra-bits for each token, plus one more byte for the long
      EOB run, just in case it's the very last token and has a run length of
      one.*/
@@ -401,16 +404,17 @@ static int oc_dec_init(oc_dec_ctx *_dec,
        _dec->state.dequant_tables[qti][pli][qi][17]+
        _dec->state.dequant_tables[qti][pli][qi][18]+
        _dec->state.dequant_tables[qti][pli][qi][24]<<(pli==0);
     }
     _dec->pp_sharp_mod[qi]=-(qsum>>11);
   }
   memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits,
    sizeof(_dec->state.loop_filter_limits));
+  oc_dec_accel_init(_dec);
   _dec->pp_level=OC_PP_LEVEL_DISABLED;
   _dec->dc_qis=NULL;
   _dec->variances=NULL;
   _dec->pp_frame_data=NULL;
   _dec->stripe_cb.ctx=NULL;
   _dec->stripe_cb.stripe_decoded=NULL;
 #if defined(HAVE_CAIRO)
   _dec->telemetry=0;
@@ -499,16 +503,17 @@ static void oc_dec_mark_all_intra(oc_dec
       int quadi;
       for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
         int bi;
         for(bi=0;bi<4;bi++){
           ptrdiff_t fragi;
           fragi=sb_maps[sbi][quadi][bi];
           if(fragi>=0){
             frags[fragi].coded=1;
+            frags[fragi].refi=OC_FRAME_SELF;
             frags[fragi].mb_mode=OC_MODE_INTRA;
             coded_fragis[ncoded_fragis++]=fragi;
           }
         }
       }
     }
     _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
     prev_ncoded_fragis=ncoded_fragis;
@@ -590,16 +595,17 @@ static void oc_dec_coded_sb_flags_unpack
   while(sbi<nsbs);
   /*TODO: run_count should be 0 here.
     If it's not, we should issue a warning of some kind.*/
 }
 
 static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
   const oc_sb_map   *sb_maps;
   const oc_sb_flags *sb_flags;
+  signed char       *mb_modes;
   oc_fragment       *frags;
   unsigned           nsbs;
   unsigned           sbi;
   unsigned           npartial;
   long               val;
   int                pli;
   int                flag;
   int                run_count;
@@ -612,27 +618,30 @@ static void oc_dec_coded_flags_unpack(oc
   if(npartial<_dec->state.nsbs)oc_dec_coded_sb_flags_unpack(_dec);
   if(npartial>0){
     val=oc_pack_read1(&_dec->opb);
     flag=!(int)val;
   }
   else flag=0;
   sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
   sb_flags=_dec->state.sb_flags;
+  mb_modes=_dec->state.mb_modes;
   frags=_dec->state.frags;
   sbi=nsbs=run_count=0;
   coded_fragis=_dec->state.coded_fragis;
   uncoded_fragis=coded_fragis+_dec->state.nfrags;
   prev_ncoded_fragis=ncoded_fragis=nuncoded_fragis=0;
   for(pli=0;pli<3;pli++){
     nsbs+=_dec->state.fplanes[pli].nsbs;
     for(;sbi<nsbs;sbi++){
       int quadi;
       for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        int quad_coded;
         int bi;
+        quad_coded=0;
         for(bi=0;bi<4;bi++){
           ptrdiff_t fragi;
           fragi=sb_maps[sbi][quadi][bi];
           if(fragi>=0){
             int coded;
             if(sb_flags[sbi].coded_fully)coded=1;
             else if(!sb_flags[sbi].coded_partially)coded=0;
             else{
@@ -640,57 +649,67 @@ static void oc_dec_coded_flags_unpack(oc
                 run_count=oc_block_run_unpack(&_dec->opb);
                 flag=!flag;
               }
               run_count--;
               coded=flag;
             }
             if(coded)coded_fragis[ncoded_fragis++]=fragi;
             else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+            quad_coded|=coded;
             frags[fragi].coded=coded;
+            frags[fragi].refi=OC_FRAME_NONE;
           }
         }
+        /*Remember if there's a coded luma block in this macro block.*/
+        if(!pli)mb_modes[sbi<<2|quadi]=quad_coded;
       }
     }
     _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
     prev_ncoded_fragis=ncoded_fragis;
   }
   _dec->state.ntotal_coded_fragis=ncoded_fragis;
   /*TODO: run_count should be 0 here.
     If it's not, we should issue a warning of some kind.*/
 }
 
 
-
-typedef int (*oc_mode_unpack_func)(oc_pack_buf *_opb);
+/*Coding scheme:
+   Codeword            Mode Index
+   0                       0
+   10                      1
+   110                     2
+   1110                    3
+   11110                   4
+   111110                  5
+   1111110                 6
+   1111111                 7*/
+static const ogg_int16_t OC_VLC_MODE_TREE[26]={
+  4,
+   -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0),
+   -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0),
+   -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+   -(3<<8|2),-(3<<8|2),-(4<<8|3),17,
+    3,
+     -(1<<8|4),-(1<<8|4),-(1<<8|4),-(1<<8|4),
+     -(2<<8|5),-(2<<8|5),-(3<<8|6),-(3<<8|7)
+};
 
-static int oc_vlc_mode_unpack(oc_pack_buf *_opb){
-  long val;
-  int  i;
-  for(i=0;i<7;i++){
-    val=oc_pack_read1(_opb);
-    if(!val)break;
-  }
-  return i;
-}
-
-static int oc_clc_mode_unpack(oc_pack_buf *_opb){
-  long val;
-  val=oc_pack_read(_opb,3);
-  return (int)val;
-}
+static const ogg_int16_t OC_CLC_MODE_TREE[9]={
+  3,
+   -(3<<8|0),-(3<<8|1),-(3<<8|2),-(3<<8|3),
+   -(3<<8|4),-(3<<8|5),-(3<<8|6),-(3<<8|7)
+};
 
 /*Unpacks the list of macro block modes for INTER frames.*/
 static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
-  const oc_mb_map     *mb_maps;
   signed char         *mb_modes;
-  const oc_fragment   *frags;
   const unsigned char *alphabet;
   unsigned char        scheme0_alphabet[8];
-  oc_mode_unpack_func  mode_unpack;
+  const ogg_int16_t   *mode_tree;
   size_t               nmbs;
   size_t               mbi;
   long                 val;
   int                  mode_scheme;
   val=oc_pack_read(&_dec->opb,3);
   mode_scheme=(int)val;
   if(mode_scheme==0){
     int mi;
@@ -702,182 +721,185 @@ static void oc_dec_mb_modes_unpack(oc_de
     for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV;
     for(mi=0;mi<OC_NMODES;mi++){
       val=oc_pack_read(&_dec->opb,3);
       scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi];
     }
     alphabet=scheme0_alphabet;
   }
   else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
-  if(mode_scheme==7)mode_unpack=oc_clc_mode_unpack;
-  else mode_unpack=oc_vlc_mode_unpack;
+  mode_tree=mode_scheme==7?OC_CLC_MODE_TREE:OC_VLC_MODE_TREE;
   mb_modes=_dec->state.mb_modes;
-  mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
   nmbs=_dec->state.nmbs;
-  frags=_dec->state.frags;
   for(mbi=0;mbi<nmbs;mbi++){
-    if(mb_modes[mbi]!=OC_MODE_INVALID){
-      int bi;
-      /*Check for a coded luma block in this macro block.*/
-      for(bi=0;bi<4&&!frags[mb_maps[mbi][0][bi]].coded;bi++);
-      /*We found one, decode a mode.*/
-      if(bi<4)mb_modes[mbi]=alphabet[(*mode_unpack)(&_dec->opb)];
-      /*There were none: INTER_NOMV is forced.*/
-      else mb_modes[mbi]=OC_MODE_INTER_NOMV;
+    if(mb_modes[mbi]>0){
+      /*We have a coded luma block; decode a mode.*/
+      mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
     }
+    /*For other valid macro blocks, INTER_NOMV is forced, but we rely on the
+       fact that OC_MODE_INTER_NOMV is already 0.*/
   }
 }
 
 
 
-typedef int (*oc_mv_comp_unpack_func)(oc_pack_buf *_opb);
+static const ogg_int16_t OC_VLC_MV_COMP_TREE[101]={
+  5,
+   -(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),
+   -(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),
+   -(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),
+   -(4<<8|32+2),-(4<<8|32+2),-(4<<8|32-2),-(4<<8|32-2),
+   -(4<<8|32+3),-(4<<8|32+3),-(4<<8|32-3),-(4<<8|32-3),
+   33,          36,          39,          42,
+   45,          50,          55,          60,
+   65,          74,          83,          92,
+    1,-(1<<8|32+4),-(1<<8|32-4),
+    1,-(1<<8|32+5),-(1<<8|32-5),
+    1,-(1<<8|32+6),-(1<<8|32-6),
+    1,-(1<<8|32+7),-(1<<8|32-7),
+    2,-(2<<8|32+8),-(2<<8|32-8),-(2<<8|32+9),-(2<<8|32-9),
+    2,-(2<<8|32+10),-(2<<8|32-10),-(2<<8|32+11),-(2<<8|32-11),
+    2,-(2<<8|32+12),-(2<<8|32-12),-(2<<8|32+13),-(2<<8|32-13),
+    2,-(2<<8|32+14),-(2<<8|32-14),-(2<<8|32+15),-(2<<8|32-15),
+    3,
+     -(3<<8|32+16),-(3<<8|32-16),-(3<<8|32+17),-(3<<8|32-17),
+     -(3<<8|32+18),-(3<<8|32-18),-(3<<8|32+19),-(3<<8|32-19),
+    3,
+     -(3<<8|32+20),-(3<<8|32-20),-(3<<8|32+21),-(3<<8|32-21),
+     -(3<<8|32+22),-(3<<8|32-22),-(3<<8|32+23),-(3<<8|32-23),
+    3,
+     -(3<<8|32+24),-(3<<8|32-24),-(3<<8|32+25),-(3<<8|32-25),
+     -(3<<8|32+26),-(3<<8|32-26),-(3<<8|32+27),-(3<<8|32-27),
+    3,
+     -(3<<8|32+28),-(3<<8|32-28),-(3<<8|32+29),-(3<<8|32-29),
+     -(3<<8|32+30),-(3<<8|32-30),-(3<<8|32+31),-(3<<8|32-31)
+};
 
-static int oc_vlc_mv_comp_unpack(oc_pack_buf *_opb){
-  long bits;
-  int  mask;
-  int  mv;
-  bits=oc_pack_read(_opb,3);
-  switch(bits){
-    case  0:return 0;
-    case  1:return 1;
-    case  2:return -1;
-    case  3:
-    case  4:{
-      mv=(int)(bits-1);
-      bits=oc_pack_read1(_opb);
-    }break;
-    /*case  5:
-    case  6:
-    case  7:*/
-    default:{
-      mv=1<<bits-3;
-      bits=oc_pack_read(_opb,bits-2);
-      mv+=(int)(bits>>1);
-      bits&=1;
-    }break;
-  }
-  mask=-(int)bits;
-  return mv+mask^mask;
-}
+static const ogg_int16_t OC_CLC_MV_COMP_TREE[65]={
+  6,
+   -(6<<8|32 +0),-(6<<8|32 -0),-(6<<8|32 +1),-(6<<8|32 -1),
+   -(6<<8|32 +2),-(6<<8|32 -2),-(6<<8|32 +3),-(6<<8|32 -3),
+   -(6<<8|32 +4),-(6<<8|32 -4),-(6<<8|32 +5),-(6<<8|32 -5),
+   -(6<<8|32 +6),-(6<<8|32 -6),-(6<<8|32 +7),-(6<<8|32 -7),
+   -(6<<8|32 +8),-(6<<8|32 -8),-(6<<8|32 +9),-(6<<8|32 -9),
+   -(6<<8|32+10),-(6<<8|32-10),-(6<<8|32+11),-(6<<8|32-11),
+   -(6<<8|32+12),-(6<<8|32-12),-(6<<8|32+13),-(6<<8|32-13),
+   -(6<<8|32+14),-(6<<8|32-14),-(6<<8|32+15),-(6<<8|32-15),
+   -(6<<8|32+16),-(6<<8|32-16),-(6<<8|32+17),-(6<<8|32-17),
+   -(6<<8|32+18),-(6<<8|32-18),-(6<<8|32+19),-(6<<8|32-19),
+   -(6<<8|32+20),-(6<<8|32-20),-(6<<8|32+21),-(6<<8|32-21),
+   -(6<<8|32+22),-(6<<8|32-22),-(6<<8|32+23),-(6<<8|32-23),
+   -(6<<8|32+24),-(6<<8|32-24),-(6<<8|32+25),-(6<<8|32-25),
+   -(6<<8|32+26),-(6<<8|32-26),-(6<<8|32+27),-(6<<8|32-27),
+   -(6<<8|32+28),-(6<<8|32-28),-(6<<8|32+29),-(6<<8|32-29),
+   -(6<<8|32+30),-(6<<8|32-30),-(6<<8|32+31),-(6<<8|32-31)
+};
 
-static int oc_clc_mv_comp_unpack(oc_pack_buf *_opb){
-  long bits;
-  int  mask;
-  int  mv;
-  bits=oc_pack_read(_opb,6);
-  mv=(int)bits>>1;
-  mask=-((int)bits&1);
-  return mv+mask^mask;
+
+static oc_mv oc_mv_unpack(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+  int dx;
+  int dy;
+  dx=oc_huff_token_decode(_opb,_tree)-32;
+  dy=oc_huff_token_decode(_opb,_tree)-32;
+  return OC_MV(dx,dy);
 }
 
 /*Unpacks the list of motion vectors for INTER frames, and propagtes the macro
    block modes and motion vectors to the individual fragments.*/
 static void oc_dec_mv_unpack_and_frag_modes_fill(oc_dec_ctx *_dec){
   const oc_mb_map        *mb_maps;
   const signed char      *mb_modes;
   oc_set_chroma_mvs_func  set_chroma_mvs;
-  oc_mv_comp_unpack_func  mv_comp_unpack;
+  const ogg_int16_t      *mv_comp_tree;
   oc_fragment            *frags;
   oc_mv                  *frag_mvs;
   const unsigned char    *map_idxs;
   int                     map_nidxs;
-  oc_mv                   last_mv[2];
+  oc_mv                   last_mv;
+  oc_mv                   prior_mv;
   oc_mv                   cbmvs[4];
   size_t                  nmbs;
   size_t                  mbi;
   long                    val;
   set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
   val=oc_pack_read1(&_dec->opb);
-  mv_comp_unpack=val?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack;
+  mv_comp_tree=val?OC_CLC_MV_COMP_TREE:OC_VLC_MV_COMP_TREE;
   map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
   map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
-  memset(last_mv,0,sizeof(last_mv));
+  prior_mv=last_mv=0;
   frags=_dec->state.frags;
   frag_mvs=_dec->state.frag_mvs;
   mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
   mb_modes=_dec->state.mb_modes;
   nmbs=_dec->state.nmbs;
   for(mbi=0;mbi<nmbs;mbi++){
-    int          mb_mode;
+    int mb_mode;
     mb_mode=mb_modes[mbi];
     if(mb_mode!=OC_MODE_INVALID){
-      oc_mv        mbmv;
-      ptrdiff_t    fragi;
-      int          coded[13];
-      int          codedi;
-      int          ncoded;
-      int          mapi;
-      int          mapii;
-      /*Search for at least one coded fragment.*/
-      ncoded=mapii=0;
-      do{
-        mapi=map_idxs[mapii];
-        fragi=mb_maps[mbi][mapi>>2][mapi&3];
-        if(frags[fragi].coded)coded[ncoded++]=mapi;
-      }
-      while(++mapii<map_nidxs);
-      if(ncoded<=0)continue;
-      switch(mb_mode){
-        case OC_MODE_INTER_MV_FOUR:{
-          oc_mv       lbmvs[4];
-          int         bi;
-          /*Mark the tail of the list, so we don't accidentally go past it.*/
-          coded[ncoded]=-1;
-          for(bi=codedi=0;bi<4;bi++){
-            if(coded[codedi]==bi){
-              codedi++;
-              fragi=mb_maps[mbi][0][bi];
-              frags[fragi].mb_mode=mb_mode;
-              lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-              lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-              memcpy(frag_mvs[fragi],lbmvs[bi],sizeof(lbmvs[bi]));
-            }
-            else lbmvs[bi][0]=lbmvs[bi][1]=0;
+      oc_mv     mbmv;
+      ptrdiff_t fragi;
+      int       mapi;
+      int       mapii;
+      int       refi;
+      if(mb_mode==OC_MODE_INTER_MV_FOUR){
+        oc_mv lbmvs[4];
+        int   bi;
+        prior_mv=last_mv;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            lbmvs[bi]=last_mv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+            frag_mvs[fragi]=lbmvs[bi];
           }
-          if(codedi>0){
-            memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
-            memcpy(last_mv[0],lbmvs[coded[codedi-1]],sizeof(last_mv[0]));
+          else lbmvs[bi]=0;
+        }
+        (*set_chroma_mvs)(cbmvs,lbmvs);
+        for(mapii=4;mapii<map_nidxs;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][mapi>>2][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            frag_mvs[fragi]=cbmvs[bi];
           }
-          if(codedi<ncoded){
-            (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
-            for(;codedi<ncoded;codedi++){
-              mapi=coded[codedi];
-              bi=mapi&3;
-              fragi=mb_maps[mbi][mapi>>2][bi];
-              frags[fragi].mb_mode=mb_mode;
-              memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(cbmvs[bi]));
-            }
+        }
+      }
+      else{
+        switch(mb_mode){
+          case OC_MODE_INTER_MV:{
+            prior_mv=last_mv;
+            last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break;
+          case OC_MODE_INTER_MV_LAST2:{
+            mbmv=prior_mv;
+            prior_mv=last_mv;
+            last_mv=mbmv;
+          }break;
+          case OC_MODE_GOLDEN_MV:{
+            mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          default:mbmv=0;break;
+        }
+        /*Fill in the MVs for the fragments.*/
+        refi=OC_FRAME_FOR_MODE(mb_mode);
+        mapii=0;
+        do{
+          mapi=map_idxs[mapii];
+          fragi=mb_maps[mbi][mapi>>2][mapi&3];
+          if(frags[fragi].coded){
+            frags[fragi].refi=refi;
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi]=mbmv;
           }
-        }break;
-        case OC_MODE_INTER_MV:{
-          memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
-          mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-          mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-        }break;
-        case OC_MODE_INTER_MV_LAST:memcpy(mbmv,last_mv[0],sizeof(mbmv));break;
-        case OC_MODE_INTER_MV_LAST2:{
-          memcpy(mbmv,last_mv[1],sizeof(mbmv));
-          memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
-          memcpy(last_mv[0],mbmv,sizeof(last_mv[0]));
-        }break;
-        case OC_MODE_GOLDEN_MV:{
-          mbmv[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-          mbmv[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-        }break;
-        default:memset(mbmv,0,sizeof(mbmv));break;
-      }
-      /*4MV mode fills in the fragments itself.
-        For all other modes we can use this common code.*/
-      if(mb_mode!=OC_MODE_INTER_MV_FOUR){
-        for(codedi=0;codedi<ncoded;codedi++){
-          mapi=coded[codedi];
-          fragi=mb_maps[mbi][mapi>>2][mapi&3];
-          frags[fragi].mb_mode=mb_mode;
-          memcpy(frag_mvs[fragi],mbmv,sizeof(mbmv));
         }
+        while(++mapii<map_nidxs);
       }
     }
   }
 }
 
 static void oc_dec_block_qis_unpack(oc_dec_ctx *_dec){
   oc_fragment     *frags;
   const ptrdiff_t *coded_fragis;
@@ -1296,44 +1318,26 @@ static int oc_dec_postprocess_init(oc_de
     memcpy(_dec->pp_frame_buf+1,
      _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]]+1,
      sizeof(_dec->pp_frame_buf[1])*2);
   }
   return 0;
 }
 
 
-
-typedef struct{
-  int                 bounding_values[256];
-  ptrdiff_t           ti[3][64];
-  ptrdiff_t           eob_runs[3][64];
-  const ptrdiff_t    *coded_fragis[3];
-  const ptrdiff_t    *uncoded_fragis[3];
-  ptrdiff_t           ncoded_fragis[3];
-  ptrdiff_t           nuncoded_fragis[3];
-  const ogg_uint16_t *dequant[3][3][2];
-  int                 fragy0[3];
-  int                 fragy_end[3];
-  int                 pred_last[3][3];
-  int                 mcu_nvfrags;
-  int                 loop_filter;
-  int                 pp_level;
-}oc_dec_pipeline_state;
-
-
-
 /*Initialize the main decoding pipeline.*/
 static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe){
   const ptrdiff_t *coded_fragis;
   const ptrdiff_t *uncoded_fragis;
+  int              flimit;
   int              pli;
   int              qii;
   int              qti;
+  int              zzi;
   /*If chroma is sub-sampled in the vertical direction, we have to decode two
      super block rows of Y' for each super block row of Cb and Cr.*/
   _pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2);
   /*Initialize the token and extra bits indices for each plane and
      coefficient.*/
   memcpy(_pipe->ti,_dec->ti0,sizeof(_pipe->ti));
   /*Also copy over the initial the EOB run counts.*/
   memcpy(_pipe->eob_runs,_dec->eob_runs,sizeof(_pipe->eob_runs));
@@ -1355,37 +1359,40 @@ static void oc_dec_pipeline_init(oc_dec_
         _pipe->dequant[pli][qii][qti]=
          _dec->state.dequant_tables[_dec->state.qis[qii]][pli][qti];
       }
     }
   }
   /*Set the previous DC predictor to 0 for all color planes and frame types.*/
   memset(_pipe->pred_last,0,sizeof(_pipe->pred_last));
   /*Initialize the bounding value array for the loop filter.*/
-  _pipe->loop_filter=!oc_state_loop_filter_init(&_dec->state,
-   _pipe->bounding_values);
+  flimit=_dec->state.loop_filter_limits[_dec->state.qis[0]];
+  _pipe->loop_filter=flimit!=0;
+  if(flimit!=0)oc_loop_filter_init(&_dec->state,_pipe->bounding_values,flimit);
   /*Initialize any buffers needed for post-processing.
     We also save the current post-processing level, to guard against the user
      changing it from a callback.*/
   if(!oc_dec_postprocess_init(_dec))_pipe->pp_level=_dec->pp_level;
   /*If we don't have enough information to post-process, disable it, regardless
      of the user-requested level.*/
   else{
     _pipe->pp_level=OC_PP_LEVEL_DISABLED;
     memcpy(_dec->pp_frame_buf,
      _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
      sizeof(_dec->pp_frame_buf[0])*3);
   }
+  /*Clear down the DCT coefficient buffer for the first block.*/
+  for(zzi=0;zzi<64;zzi++)_pipe->dct_coeffs[zzi]=0;
 }
 
 /*Undo the DC prediction in a single plane of an MCU (one or two super block
    rows).
   As a side effect, the number of coded and uncoded fragments in this plane of
    the MCU is also computed.*/
-static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe,int _pli){
   const oc_fragment_plane *fplane;
   oc_fragment             *frags;
   int                     *pred_last;
   ptrdiff_t                ncoded_fragis;
   ptrdiff_t                fragi;
   int                      fragx;
   int                      fragy;
@@ -1403,63 +1410,61 @@ static void oc_dec_dc_unpredict_mcu_plan
   ncoded_fragis=0;
   fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
   for(fragy=fragy0;fragy<fragy_end;fragy++){
     if(fragy==0){
       /*For the first row, all of the cases reduce to just using the previous
          predictor for the same reference frame.*/
       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
         if(frags[fragi].coded){
-          int ref;
-          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
-          pred_last[ref]=frags[fragi].dc+=pred_last[ref];
+          int refi;
+          refi=frags[fragi].refi;
+          pred_last[refi]=frags[fragi].dc+=pred_last[refi];
           ncoded_fragis++;
         }
       }
     }
     else{
       oc_fragment *u_frags;
       int          l_ref;
       int          ul_ref;
       int          u_ref;
       u_frags=frags-nhfrags;
       l_ref=-1;
       ul_ref=-1;
-      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      u_ref=u_frags[fragi].refi;
       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
         int ur_ref;
         if(fragx+1>=nhfrags)ur_ref=-1;
-        else{
-          ur_ref=u_frags[fragi+1].coded?
-           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
-        }
+        else ur_ref=u_frags[fragi+1].refi;
         if(frags[fragi].coded){
           int pred;
-          int ref;
-          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          int refi;
+          refi=frags[fragi].refi;
           /*We break out a separate case based on which of our neighbors use
              the same reference frames.
             This is somewhat faster than trying to make a generic case which
              handles all of them, since it reduces lots of poorly predicted
              jumps to one switch statement, and also lets a number of the
              multiplications be optimized out by strength reduction.*/
-          switch((l_ref==ref)|(ul_ref==ref)<<1|
-           (u_ref==ref)<<2|(ur_ref==ref)<<3){
-            default:pred=pred_last[ref];break;
+          switch((l_ref==refi)|(ul_ref==refi)<<1|
+           (u_ref==refi)<<2|(ur_ref==refi)<<3){
+            default:pred=pred_last[refi];break;
             case  1:
             case  3:pred=frags[fragi-1].dc;break;
             case  2:pred=u_frags[fragi-1].dc;break;
             case  4:
             case  6:
             case 12:pred=u_frags[fragi].dc;break;
             case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
             case  8:pred=u_frags[fragi+1].dc;break;
             case  9:
             case 11:
             case 13:{
+              /*The TI compiler mis-compiles this line.*/
               pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
             }break;
             case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
             case 14:{
               pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
                +10*u_frags[fragi].dc)/16;
             }break;
             case  7:
@@ -1471,36 +1476,36 @@ static void oc_dec_dc_unpredict_mcu_plan
               p1=u_frags[fragi-1].dc;
               p2=u_frags[fragi].dc;
               pred=(29*(p0+p2)-26*p1)/32;
               if(abs(pred-p2)>128)pred=p2;
               else if(abs(pred-p0)>128)pred=p0;
               else if(abs(pred-p1)>128)pred=p1;
             }break;
           }
-          pred_last[ref]=frags[fragi].dc+=pred;
+          pred_last[refi]=frags[fragi].dc+=pred;
           ncoded_fragis++;
-          l_ref=ref;
+          l_ref=refi;
         }
         else l_ref=-1;
         ul_ref=u_ref;
         u_ref=ur_ref;
       }
     }
   }
   _pipe->ncoded_fragis[_pli]=ncoded_fragis;
   /*Also save the number of uncoded fragments so we know how many to copy.*/
   _pipe->nuncoded_fragis[_pli]=
    (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
 }
 
 /*Reconstructs all coded fragments in a single MCU (one or two super block
    rows).
   This requires that each coded fragment have a proper macro block mode and
-   motion vector (if not in INTRA mode), and have it's DC value decoded, with
+   motion vector (if not in INTRA mode), and have its DC value decoded, with
    the DC prediction process reversed, and the number of coded and uncoded
    fragments in this plane of the MCU be counted.
   The token lists for each color plane and coefficient should also be filled
    in, along with initial token offsets, extra bits offsets, and EOB run
    counts.*/
 static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe,int _pli){
   unsigned char       *dct_tokens;
@@ -1517,26 +1522,21 @@ static void oc_dec_frags_recon_mcu_plane
   dct_fzig_zag=_dec->state.opt_data.dct_fzig_zag;
   frags=_dec->state.frags;
   coded_fragis=_pipe->coded_fragis[_pli];
   ncoded_fragis=_pipe->ncoded_fragis[_pli];
   ti=_pipe->ti[_pli];
   eob_runs=_pipe->eob_runs[_pli];
   for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
   for(fragii=0;fragii<ncoded_fragis;fragii++){
-    /*This array is made one element larger because the zig-zag index array
-       uses the final element as a dumping ground for out-of-range indices
-       to protect us from buffer overflow.*/
-    OC_ALIGN8(ogg_int16_t dct_coeffs[65]);
     const ogg_uint16_t *ac_quant;
     ptrdiff_t           fragi;
     int                 last_zzi;
     int                 zzi;
     fragi=coded_fragis[fragii];
-    for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
     qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
     ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
     /*Decode the AC coefficients.*/
     for(zzi=0;zzi<64;){
       int token;
       last_zzi=zzi;
       if(eob_runs[zzi]){
         eob_runs[zzi]--;
@@ -1563,44 +1563,50 @@ static void oc_dec_frags_recon_mcu_plane
           if(eob==0)eob=OC_DCT_EOB_FINISH;
         }
         rlen=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
         cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
         coeff=cw>>OC_DCT_CW_MAG_SHIFT;
         eob_runs[zzi]=eob;
         ti[zzi]=lti;
         zzi+=rlen;
-        dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+        _pipe->dct_coeffs[dct_fzig_zag[zzi]]=
+         (ogg_int16_t)(coeff*(int)ac_quant[zzi]);
         zzi+=!eob;
       }
     }
     /*TODO: zzi should be exactly 64 here.
       If it's not, we should report some kind of warning.*/
     zzi=OC_MINI(zzi,64);
-    dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
+    _pipe->dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
     /*last_zzi is always initialized.
       If your compiler thinks otherwise, it is dumb.*/
     oc_state_frag_recon(&_dec->state,fragi,_pli,
-     dct_coeffs,last_zzi,dc_quant[qti]);
+     _pipe->dct_coeffs,last_zzi,dc_quant[qti]);
   }
   _pipe->coded_fragis[_pli]+=ncoded_fragis;
   /*Right now the reconstructed MCU has only the coded blocks in it.*/
   /*TODO: We make the decision here to always copy the uncoded blocks into it
      from the reference frame.
     We could also copy the coded blocks back over the reference frame, if we
      wait for an additional MCU to be decoded, which might be faster if only a
      small number of blocks are coded.
     However, this introduces more latency, creating a larger cache footprint.
     It's unknown which decision is better, but this one results in simpler
      code, and the hard case (high bitrate, high resolution) is handled
      correctly.*/
   /*Copy the uncoded blocks from the previous reference frame.*/
-  _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
-  oc_state_frag_copy_list(&_dec->state,_pipe->uncoded_fragis[_pli],
-   _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
+  if(_pipe->nuncoded_fragis[_pli]>0){
+    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+    oc_frag_copy_list(&_dec->state,
+     _dec->state.ref_frame_data[OC_FRAME_SELF],
+     _dec->state.ref_frame_data[OC_FRAME_PREV],
+     _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+     _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs);
+  }
 }
 
 /*Filter a horizontal block edge.*/
 static void oc_filter_hedge(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src,int _src_ystride,int _qstep,int _flimit,
  int *_variance0,int *_variance1){
   unsigned char       *rdst;
   const unsigned char *rsrc;
@@ -1948,29 +1954,29 @@ static void oc_dec_dering_frag_rows(oc_d
   }
 }
 
 
 
 th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){
   oc_dec_ctx *dec;
   if(_info==NULL||_setup==NULL)return NULL;
-  dec=_ogg_malloc(sizeof(*dec));
+  dec=oc_aligned_malloc(sizeof(*dec),16);
   if(dec==NULL||oc_dec_init(dec,_info,_setup)<0){
-    _ogg_free(dec);
+    oc_aligned_free(dec);
     return NULL;
   }
   dec->state.curframe_num=0;
   return dec;
 }
 
 void th_decode_free(th_dec_ctx *_dec){
   if(_dec!=NULL){
     oc_dec_clear(_dec);
-    _ogg_free(_dec);
+    oc_aligned_free(_dec);
   }
 }
 
 int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
  size_t _buf_sz){
   switch(_req){
   case TH_DECCTL_GET_PPLEVEL_MAX:{
     if(_dec==NULL||_buf==NULL)return TH_EFAULT;
@@ -2042,82 +2048,103 @@ int th_decode_ctl(th_dec_ctx *_dec,int _
   default:return TH_EIMPL;
   }
 }
 
 /*We're decoding an INTER frame, but have no initialized reference
    buffers (i.e., decoding did not start on a key frame).
   We initialize them to a solid gray here.*/
 static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){
-  th_info *info;
-  size_t   yplane_sz;
-  size_t   cplane_sz;
-  int      yhstride;
-  int      yheight;
-  int      chstride;
-  int      cheight;
+  th_info   *info;
+  size_t     yplane_sz;
+  size_t     cplane_sz;
+  ptrdiff_t  yoffset;
+  int        yhstride;
+  int        yheight;
+  int        chstride;
+  int        cheight;
   _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0;
   _dec->state.ref_frame_idx[OC_FRAME_PREV]=0;
-  _dec->state.ref_frame_idx[OC_FRAME_SELF]=1;
+  _dec->state.ref_frame_idx[OC_FRAME_SELF]=0;
+  _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+   _dec->state.ref_frame_data[OC_FRAME_PREV]=
+   _dec->state.ref_frame_data[OC_FRAME_SELF]=
+   _dec->state.ref_frame_bufs[0][0].data;
+  memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[0],
+   sizeof(_dec->pp_frame_buf[0])*3);
   info=&_dec->state.info;
-  yhstride=info->frame_width+2*OC_UMV_PADDING;
+  yhstride=abs(_dec->state.ref_ystride[0]);
   yheight=info->frame_height+2*OC_UMV_PADDING;
-  chstride=yhstride>>!(info->pixel_fmt&1);
+  chstride=abs(_dec->state.ref_ystride[1]);
   cheight=yheight>>!(info->pixel_fmt&2);
-  yplane_sz=yhstride*(size_t)yheight;
+  yplane_sz=yhstride*(size_t)yheight+16;
   cplane_sz=chstride*(size_t)cheight;
-  memset(_dec->state.ref_frame_data[0],0x80,yplane_sz+2*cplane_sz);
+  yoffset=_dec->state.ref_ystride[0]*(yheight-1)-
+   (OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride);
+  memset(_dec->state.ref_frame_data[0]-yoffset,0x80,yplane_sz+2*cplane_sz);
 }
 
 int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
  ogg_int64_t *_granpos){
   int ret;
   if(_dec==NULL||_op==NULL)return TH_EFAULT;
   /*A completely empty packet indicates a dropped frame and is treated exactly
-     like an inter frame with no coded blocks.
-    Only proceed if we have a non-empty packet.*/
-  if(_op->bytes!=0){
-    oc_dec_pipeline_state pipe;
-    th_ycbcr_buffer       stripe_buf;
-    int                   stripe_fragy;
-    int                   refi;
-    int                   pli;
-    int                   notstart;
-    int                   notdone;
+     like an inter frame with no coded blocks.*/
+  if(_op->bytes==0){
+    _dec->state.frame_type=OC_INTER_FRAME;
+    _dec->state.ntotal_coded_fragis=0;
+  }
+  else{
     oc_pack_readinit(&_dec->opb,_op->packet,_op->bytes);
+    ret=oc_dec_frame_header_unpack(_dec);
+    if(ret<0)return ret;
+    if(_dec->state.frame_type==OC_INTRA_FRAME)oc_dec_mark_all_intra(_dec);
+    else oc_dec_coded_flags_unpack(_dec);
+  }
+  /*If there have been no reference frames, and we need one, initialize one.*/
+  if(_dec->state.frame_type!=OC_INTRA_FRAME&&
+   (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
+   _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){
+    oc_dec_init_dummy_frame(_dec);
+  }
+  /*If this was an inter frame with no coded blocks...*/
+  if(_dec->state.ntotal_coded_fragis<=0){
+    /*Just update the granule position and return.*/
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.curframe_num++;
+    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
+    return TH_DUPFRAME;
+  }
+  else{
+    th_ycbcr_buffer stripe_buf;
+    int             stripe_fragy;
+    int             refi;
+    int             pli;
+    int             notstart;
+    int             notdone;
+    /*Select a free buffer to use for the reconstructed version of this frame.*/
+    for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
+     refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+    _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+    _dec->state.ref_frame_data[OC_FRAME_SELF]=
+     _dec->state.ref_frame_bufs[refi][0].data;
 #if defined(HAVE_CAIRO)
     _dec->telemetry_frame_bytes=_op->bytes;
 #endif
-    ret=oc_dec_frame_header_unpack(_dec);
-    if(ret<0)return ret;
-    /*Select a free buffer to use for the reconstructed version of this
-       frame.*/
-    if(_dec->state.frame_type!=OC_INTRA_FRAME&&
-     (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
-     _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){
-      /*No reference frames yet!*/
-      oc_dec_init_dummy_frame(_dec);
-      refi=_dec->state.ref_frame_idx[OC_FRAME_SELF];
-    }
-    else{
-      for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
-       refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
-      _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
-    }
     if(_dec->state.frame_type==OC_INTRA_FRAME){
-      oc_dec_mark_all_intra(_dec);
       _dec->state.keyframe_num=_dec->state.curframe_num;
 #if defined(HAVE_CAIRO)
       _dec->telemetry_coding_bytes=
        _dec->telemetry_mode_bytes=
        _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb);
 #endif
     }
     else{
-      oc_dec_coded_flags_unpack(_dec);
 #if defined(HAVE_CAIRO)
       _dec->telemetry_coding_bytes=oc_pack_bytes_left(&_dec->opb);
 #endif
       oc_dec_mb_modes_unpack(_dec);
 #if defined(HAVE_CAIRO)
       _dec->telemetry_mode_bytes=oc_pack_bytes_left(&_dec->opb);
 #endif
       oc_dec_mv_unpack_and_frag_modes_fill(_dec);
@@ -2155,85 +2182,87 @@ int th_decode_packetin(th_dec_ctx *_dec,
       Thus the actual number of decoded rows available is slightly smaller for
        the first MCU, and slightly larger for the last.
 
       This entire process allows us to operate on the data while it is still in
        cache, resulting in big performance improvements.
       An application callback allows further application processing (blitting
        to video memory, color conversion, etc.) to also use the data while it's
        in cache.*/
-    oc_dec_pipeline_init(_dec,&pipe);
+    oc_dec_pipeline_init(_dec,&_dec->pipe);
     oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
     notstart=0;
     notdone=1;
-    for(stripe_fragy=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
+    for(stripe_fragy=0;notdone;stripe_fragy+=_dec->pipe.mcu_nvfrags){
       int avail_fragy0;
       int avail_fragy_end;
       avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
-      notdone=stripe_fragy+pipe.mcu_nvfrags<avail_fragy_end;
+      notdone=stripe_fragy+_dec->pipe.mcu_nvfrags<avail_fragy_end;
       for(pli=0;pli<3;pli++){
         oc_fragment_plane *fplane;
         int                frag_shift;
         int                pp_offset;
         int                sdelay;
         int                edelay;
         fplane=_dec->state.fplanes+pli;
         /*Compute the first and last fragment row of the current MCU for this
            plane.*/
         frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
-        pipe.fragy0[pli]=stripe_fragy>>frag_shift;
-        pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
-         pipe.fragy0[pli]+(pipe.mcu_nvfrags>>frag_shift));
-        oc_dec_dc_unpredict_mcu_plane(_dec,&pipe,pli);
-        oc_dec_frags_recon_mcu_plane(_dec,&pipe,pli);
+        _dec->pipe.fragy0[pli]=stripe_fragy>>frag_shift;
+        _dec->pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
+         _dec->pipe.fragy0[pli]+(_dec->pipe.mcu_nvfrags>>frag_shift));
+        oc_dec_dc_unpredict_mcu_plane(_dec,&_dec->pipe,pli);
+        oc_dec_frags_recon_mcu_plane(_dec,&_dec->pipe,pli);
         sdelay=edelay=0;
-        if(pipe.loop_filter){
+        if(_dec->pipe.loop_filter){
           sdelay+=notstart;
           edelay+=notdone;
-          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values,
-           refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+          oc_state_loop_filter_frag_rows(&_dec->state,
+           _dec->pipe.bounding_values,OC_FRAME_SELF,pli,
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
         }
         /*To fill the borders, we have an additional two pixel delay, since a
            fragment in the next row could filter its top edge, using two pixels
            from a fragment in this row.
           But there's no reason to delay a full fragment between the two.*/
         oc_state_borders_fill_rows(&_dec->state,refi,pli,
-         (pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
-         (pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
+         (_dec->pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
+         (_dec->pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
         /*Out-of-loop post-processing.*/
         pp_offset=3*(pli!=0);
-        if(pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
+        if(_dec->pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
           /*Perform de-blocking in one plane.*/
           sdelay+=notstart;
           edelay+=notdone;
           oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf,
            _dec->state.ref_frame_bufs[refi],pli,
-           pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
-          if(pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+          if(_dec->pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
             /*Perform de-ringing in one plane.*/
             sdelay+=notstart;
             edelay+=notdone;
             oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli,
-             pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+             _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
           }
         }
         /*If no post-processing is done, we still need to delay a row for the
            loop filter, thanks to the strange filtering order VP3 chose.*/
-        else if(pipe.loop_filter){
+        else if(_dec->pipe.loop_filter){
           sdelay+=notstart;
           edelay+=notdone;
         }
         /*Compute the intersection of the available rows in all planes.
           If chroma is sub-sampled, the effect of each of its delays is
            doubled, but luma might have more post-processing filters enabled
            than chroma, so we don't know up front which one is the limiting
            factor.*/
-        avail_fragy0=OC_MINI(avail_fragy0,pipe.fragy0[pli]-sdelay<<frag_shift);
+        avail_fragy0=OC_MINI(avail_fragy0,
+         _dec->pipe.fragy0[pli]-sdelay<<frag_shift);
         avail_fragy_end=OC_MINI(avail_fragy_end,
-         pipe.fragy_end[pli]-edelay<<frag_shift);
+         _dec->pipe.fragy_end[pli]-edelay<<frag_shift);
       }
       if(_dec->stripe_cb.stripe_decoded!=NULL){
         /*The callback might want to use the FPU, so let's make sure they can.
           We violate all kinds of ABI restrictions by not doing this until
            now, but none of them actually matter since we don't use floating
            point ourselves.*/
         oc_restore_fpu(&_dec->state);
         /*Make the callback, ensuring we flip the sense of the "start" and
@@ -2247,50 +2276,36 @@ int th_decode_packetin(th_dec_ctx *_dec,
     /*Finish filling in the reference frame borders.*/
     for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);
     /*Update the reference frame indices.*/
     if(_dec->state.frame_type==OC_INTRA_FRAME){
       /*The new frame becomes both the previous and gold reference frames.*/
       _dec->state.ref_frame_idx[OC_FRAME_GOLD]=
        _dec->state.ref_frame_idx[OC_FRAME_PREV]=
        _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+       _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
     }
     else{
       /*Otherwise, just replace the previous reference frame.*/
       _dec->state.ref_frame_idx[OC_FRAME_PREV]=
        _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
     }
     /*Restore the FPU before dump_frame, since that _does_ use the FPU (for PNG
        gamma values, if nothing else).*/
     oc_restore_fpu(&_dec->state);
 #if defined(OC_DUMP_IMAGES)
-    /*Don't dump images for dropped frames.*/
+    /*We only dump images if there were some coded blocks.*/
     oc_state_dump_frame(&_dec->state,OC_FRAME_SELF,"dec");
 #endif
     return 0;
   }
-  else{
-    if(_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
-     _dec->state.ref_frame_idx[OC_FRAME_PREV]<0){
-      int refi;
-      /*No reference frames yet!*/
-      oc_dec_init_dummy_frame(_dec);
-      refi=_dec->state.ref_frame_idx[OC_FRAME_PREV];
-      _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
-      memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[refi],
-       sizeof(_dec->pp_frame_buf[0])*3);
-    }
-    /*Just update the granule position and return.*/
-    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
-     _dec->state.info.keyframe_granule_shift)
-     +(_dec->state.curframe_num-_dec->state.keyframe_num);
-    _dec->state.curframe_num++;
-    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
-    return TH_DUPFRAME;
-  }
 }
 
 int th_decode_ycbcr_out(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr){
   if(_dec==NULL||_ycbcr==NULL)return TH_EFAULT;
   oc_ycbcr_buffer_flip(_ycbcr,_dec->pp_frame_buf);
 #if defined(HAVE_CAIRO)
   /*If telemetry ioctls are active, we need to draw to the output buffer.
     Stuff the plane into cairo.*/
@@ -2425,22 +2440,24 @@ int th_decode_ycbcr_out(th_dec_ctx *_dec
             cairo_set_source_rgba(c,1.,0,0,.5);
             cairo_rectangle(c,x+2.5,y+2.5,11,11);
             cairo_stroke_preserve(c);
             cairo_set_source_rgba(c,1.,0,0,.25);
             cairo_fill(c);
           }
         }
         else{
-          const signed char *frag_mv;
-          ptrdiff_t          fragi;
+          ptrdiff_t fragi;
+          int       frag_mvx;
+          int       frag_mvy;
           for(bi=0;bi<4;bi++){
             fragi=mb_maps[mbi][0][bi];
             if(fragi>=0&&frags[fragi].coded){
-              frag_mv=frag_mvs[fragi];
+              frag_mvx=OC_MV_X(frag_mvs[fragi]);
+              frag_mvy=OC_MV_Y(frag_mvs[fragi]);
               break;
             }
           }
           if(bi<4){
             switch(mb_modes[mbi]){
               case OC_MODE_INTRA:{
                 if(_dec->telemetry_mbmode&0x02){
                   cairo_set_source_rgba(c,1.,0,0,.5);
@@ -2461,46 +2478,46 @@ int th_decode_ycbcr_out(th_dec_ctx *_dec
               }break;
               case OC_MODE_INTER_MV:{
                 if(_dec->telemetry_mbmode&0x04){
                   cairo_rectangle(c,x+2.5,y+2.5,11,11);
                   cairo_set_source_rgba(c,0,1.,0,.5);
                   cairo_stroke(c);
                 }
                 if(_dec->telemetry_mv&0x04){
-                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
+                  cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
                   cairo_set_source_rgba(c,1.,1.,1.,.9);
                   cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
+                  cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
+                  cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,1.);
                   cairo_line_to(c,x+8,y+8);
                   cairo_stroke(c);
                 }
               }break;
               case OC_MODE_INTER_MV_LAST:{
                 if(_dec->telemetry_mbmode&0x08){
                   cairo_rectangle(c,x+2.5,y+2.5,11,11);
                   cairo_set_source_rgba(c,0,1.,0,.5);
                   cairo_move_to(c,x+13.5,y+2.5);
                   cairo_line_to(c,x+2.5,y+8);
                   cairo_line_to(c,x+13.5,y+13.5);
                   cairo_stroke(c);
                 }
                 if(_dec->telemetry_mv&0x08){
-                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
+                  cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
                   cairo_set_source_rgba(c,1.,1.,1.,.9);
                   cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
+                  cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
+                  cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,1.);
                   cairo_line_to(c,x+8,y+8);
                   cairo_stroke(c);
                 }
               }break;
               case OC_MODE_INTER_MV_LAST2:{
                 if(_dec->telemetry_mbmode&0x10){
@@ -2510,23 +2527,23 @@ int th_decode_ycbcr_out(th_dec_ctx *_dec
                   cairo_line_to(c,x+2.5,y+8);
                   cairo_line_to(c,x+8,y+13.5);
                   cairo_move_to(c,x+13.5,y+2.5);
                   cairo_line_to(c,x+8,y+8);
                   cairo_line_to(c,x+13.5,y+13.5);
                   cairo_stroke(c);
                 }
                 if(_dec->telemetry_mv&0x10){
-                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
+                  cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
                   cairo_set_source_rgba(c,1.,1.,1.,.9);
                   cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
+                  cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
+                  cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,1.);
                   cairo_line_to(c,x+8,y+8);
                   cairo_stroke(c);
                 }
               }break;
               case OC_MODE_GOLDEN_NOMV:{
                 if(_dec->telemetry_mbmode&0x20){
@@ -2539,23 +2556,23 @@ int th_decode_ycbcr_out(th_dec_ctx *_dec
               }break;
               case OC_MODE_GOLDEN_MV:{
                 if(_dec->telemetry_mbmode&0x40){
                   cairo_rectangle(c,x+2.5,y+2.5,11,11);
                   cairo_set_source_rgba(c,1.,1.,0,.5);
                   cairo_stroke(c);
                 }
                 if(_dec->telemetry_mv&0x40){
-                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
+                  cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
                   cairo_set_source_rgba(c,1.,1.,1.,.9);
                   cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
+                  cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
+                  cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,1.);
                   cairo_line_to(c,x+8,y+8);
                   cairo_stroke(c);
                 }
               }break;
               case OC_MODE_INTER_MV_FOUR:{
                 if(_dec->telemetry_mbmode&0x80){
@@ -2564,69 +2581,73 @@ int th_decode_ycbcr_out(th_dec_ctx *_dec
                   cairo_rectangle(c,x+2.5,y+9.5,4,4);
                   cairo_rectangle(c,x+9.5,y+9.5,4,4);
                   cairo_set_source_rgba(c,0,1.,0,.5);
                   cairo_stroke(c);
                 }
                 /*4mv is odd, coded in raster order.*/
                 fragi=mb_maps[mbi][0][0];
                 if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
-                  frag_mv=frag_mvs[fragi];
-                  cairo_move_to(c,x+4+frag_mv[0],y+12-frag_mv[1]);
+                  frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                  frag_mvx=OC_MV_Y(frag_mvs[fragi]);
+                  cairo_move_to(c,x+4+frag_mvx,y+12-frag_mvy);
                   cairo_set_source_rgba(c,1.,1.,1.,.9);
                   cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+4+frag_mv[0]*.66,y+12-frag_mv[1]*.66);
+                  cairo_line_to(c,x+4+frag_mvx*.66,y+12-frag_mvy*.66);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+4+frag_mv[0]*.33,y+12-frag_mv[1]*.33);
+                  cairo_line_to(c,x+4+frag_mvx*.33,y+12-frag_mvy*.33);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,1.);
                   cairo_line_to(c,x+4,y+12);
                   cairo_stroke(c);
                 }
                 fragi=mb_maps[mbi][0][1];
                 if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
-                  frag_mv=frag_mvs[fragi];
-                  cairo_move_to(c,x+12+frag_mv[0],y+12-frag_mv[1]);
+                  frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                  frag_mvx=OC_MV_Y(frag_mvs[fragi]);
+                  cairo_move_to(c,x+12+frag_mvx,y+12-frag_mvy);
                   cairo_set_source_rgba(c,1.,1.,1.,.9);
                   cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+12+frag_mv[0]*.66,y+12-frag_mv[1]*.66);
+                  cairo_line_to(c,x+12+frag_mvx*.66,y+12-frag_mvy*.66);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+12+frag_mv[0]*.33,y+12-frag_mv[1]*.33);
+                  cairo_line_to(c,x+12+frag_mvx*.33,y+12-frag_mvy*.33);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,1.);
                   cairo_line_to(c,x+12,y+12);
                   cairo_stroke(c);
                 }
                 fragi=mb_maps[mbi][0][2];
                 if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
-                  frag_mv=frag_mvs[fragi];
-                  cairo_move_to(c,x+4+frag_mv[0],y+4-frag_mv[1]);
+                  frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                  frag_mvx=OC_MV_Y(frag_mvs[fragi]);
+                  cairo_move_to(c,x+4+frag_mvx,y+4-frag_mvy);
                   cairo_set_source_rgba(c,1.,1.,1.,.9);
                   cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+4+frag_mv[0]*.66,y+4-frag_mv[1]*.66);
+                  cairo_line_to(c,x+4+frag_mvx*.66,y+4-frag_mvy*.66);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+4+frag_mv[0]*.33,y+4-frag_mv[1]*.33);
+                  cairo_line_to(c,x+4+frag_mvx*.33,y+4-frag_mvy*.33);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,1.);
                   cairo_line_to(c,x+4,y+4);
                   cairo_stroke(c);
                 }
                 fragi=mb_maps[mbi][0][3];
                 if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
-                  frag_mv=frag_mvs[fragi];
-                  cairo_move_to(c,x+12+frag_mv[0],y+4-frag_mv[1]);
+                  frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                  frag_mvx=OC_MV_Y(frag_mvs[fragi]);
+                  cairo_move_to(c,x+12+frag_mvx,y+4-frag_mvy);
                   cairo_set_source_rgba(c,1.,1.,1.,.9);
                   cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+12+frag_mv[0]*.66,y+4-frag_mv[1]*.66);
+                  cairo_line_to(c,x+12+frag_mvx*.66,y+4-frag_mvy*.66);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+12+frag_mv[0]*.33,y+4-frag_mv[1]*.33);
+                  cairo_line_to(c,x+12+frag_mvx*.33,y+4-frag_mvy*.33);
                   cairo_stroke_preserve(c);
                   cairo_set_line_width(c,1.);
                   cairo_line_to(c,x+12,y+4);
                   cairo_stroke(c);
                 }
               }break;
             }
           }
deleted file mode 100644
--- a/media/libtheora/lib/encint.h
+++ /dev/null
@@ -1,493 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: encint.h 16503 2009-08-22 18:14:02Z giles $
-
- ********************************************************************/
-#if !defined(_encint_H)
-# define _encint_H (1)
-# if defined(HAVE_CONFIG_H)
-#  include "config.h"
-# endif
-# include "theora/theoraenc.h"
-# include "internal.h"
-# include "ocintrin.h"
-# include "mathops.h"
-# include "enquant.h"
-# include "huffenc.h"
-/*# define OC_COLLECT_METRICS*/
-
-
-
-typedef oc_mv                         oc_mv2[2];
-
-typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
-typedef struct oc_mb_enc_info         oc_mb_enc_info;
-typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
-typedef struct oc_iir_filter          oc_iir_filter;
-typedef struct oc_frame_metrics       oc_frame_metrics;
-typedef struct oc_rc_state            oc_rc_state;
-typedef struct th_enc_ctx             oc_enc_ctx;
-typedef struct oc_token_checkpoint    oc_token_checkpoint;
-
-
-
-/*Constants for the packet-out state machine specific to the encoder.*/
-
-/*Next packet to emit: Data packet, but none are ready yet.*/
-#define OC_PACKET_EMPTY (0)
-/*Next packet to emit: Data packet, and one is ready.*/
-#define OC_PACKET_READY (1)
-
-/*All features enabled.*/
-#define OC_SP_LEVEL_SLOW       (0)
-/*Enable early skip.*/
-#define OC_SP_LEVEL_EARLY_SKIP (1)
-/*Disable motion compensation.*/
-#define OC_SP_LEVEL_NOMC       (2)
-/*Maximum valid speed level.*/
-#define OC_SP_LEVEL_MAX        (2)
-
-
-/*The bits used for each of the MB mode codebooks.*/
-extern const unsigned char OC_MODE_BITS[2][OC_NMODES];
-
-/*The bits used for each of the MV codebooks.*/
-extern const unsigned char OC_MV_BITS[2][64];
-
-/*The minimum value that can be stored in a SB run for each codeword.
-  The last entry is the upper bound on the length of a single SB run.*/
-extern const ogg_uint16_t  OC_SB_RUN_VAL_MIN[8];
-/*The bits used for each SB run codeword.*/
-extern const unsigned char OC_SB_RUN_CODE_NBITS[7];
-
-/*The bits used for each block run length (starting with 1).*/
-extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
-
-
-
-/*Encoder specific functions with accelerated variants.*/
-struct oc_enc_opt_vtable{
-  unsigned (*frag_sad)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride);
-  unsigned (*frag_sad_thresh)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride,unsigned _thresh);
-  unsigned (*frag_sad2_thresh)(const unsigned char *_src,
-   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
-   unsigned _thresh);
-  unsigned (*frag_satd_thresh)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride,unsigned _thresh);
-  unsigned (*frag_satd2_thresh)(const unsigned char *_src,
-   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
-   unsigned _thresh);
-  unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride);
-  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
-   const unsigned char *_ref,int _ystride);
-  void     (*frag_sub_128)(ogg_int16_t _diff[64],
-   const unsigned char *_src,int _ystride);
-  void     (*frag_copy2)(unsigned char *_dst,
-   const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-  void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
-   const ogg_int16_t _residue[64]);
-  void     (*frag_recon_inter)(unsigned char *_dst,
-   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-  void     (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-};
-
-
-void oc_enc_vtable_init(oc_enc_ctx *_enc);
-
-
-
-/*Encoder-specific macroblock information.*/
-struct oc_mb_enc_info{
-  /*Neighboring macro blocks that have MVs available from the current frame.*/
-  unsigned      cneighbors[4];
-  /*Neighboring macro blocks to use for MVs from the previous frame.*/
-  unsigned      pneighbors[4];
-  /*The number of current-frame neighbors.*/
-  unsigned char ncneighbors;
-  /*The number of previous-frame neighbors.*/
-  unsigned char npneighbors;
-  /*Flags indicating which MB modes have been refined.*/
-  unsigned char refined;
-  /*Motion vectors for a macro block for the current frame and the
-     previous two frames.
-    Each is a set of 2 vectors against OC_FRAME_GOLD and OC_FRAME_PREV, which
-     can be used to estimate constant velocity and constant acceleration
-     predictors.
-    Uninitialized MVs are (0,0).*/
-  oc_mv2        analysis_mv[3];
-  /*Current unrefined analysis MVs.*/
-  oc_mv         unref_mv[2];
-  /*Unrefined block MVs.*/
-  oc_mv         block_mv[4];
-  /*Refined block MVs.*/
-  oc_mv         ref_mv[4];
-  /*Minimum motion estimation error from the analysis stage.*/
-  ogg_uint16_t  error[2];
-  /*MB error for half-pel refinement for each frame type.*/
-  unsigned      satd[2];
-  /*Block error for half-pel refinement.*/
-  unsigned      block_satd[4];
-};
-
-
-
-/*State machine to estimate the opportunity cost of coding a MB mode.*/
-struct oc_mode_scheme_chooser{
-  /*Pointers to the a list containing the index of each mode in the mode
-     alphabet used by each scheme.
-    The first entry points to the dynamic scheme0_ranks, while the remaining 7
-     point to the constant entries stored in OC_MODE_SCHEMES.*/
-  const unsigned char *mode_ranks[8];
-  /*The ranks for each mode when coded with scheme 0.
-    These are optimized so that the more frequent modes have lower ranks.*/
-  unsigned char        scheme0_ranks[OC_NMODES];
-  /*The list of modes, sorted in descending order of frequency, that
-    corresponds to the ranks above.*/
-  unsigned char        scheme0_list[OC_NMODES];
-  /*The number of times each mode has been chosen so far.*/
-  int                  mode_counts[OC_NMODES];
-  /*The list of mode coding schemes, sorted in ascending order of bit cost.*/
-  unsigned char        scheme_list[8];
-  /*The number of bits used by each mode coding scheme.*/
-  ptrdiff_t            scheme_bits[8];
-};
-
-
-void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
-
-
-
-/*A 2nd order low-pass Bessel follower.
-  We use this for rate control because it has fast reaction time, but is
-   critically damped.*/
-struct oc_iir_filter{
-  ogg_int32_t c[2];
-  ogg_int64_t g;
-  ogg_int32_t x[2];
-  ogg_int32_t y[2];
-};
-
-
-
-/*The 2-pass metrics associated with a single frame.*/
-struct oc_frame_metrics{
-  /*The log base 2 of the scale factor for this frame in Q24 format.*/
-  ogg_int32_t   log_scale;
-  /*The number of application-requested duplicates of this frame.*/
-  unsigned      dup_count:31;
-  /*The frame type from pass 1.*/
-  unsigned      frame_type:1;
-};
-
-
-
-/*Rate control state information.*/
-struct oc_rc_state{
-  /*The target average bits per frame.*/
-  ogg_int64_t        bits_per_frame;
-  /*The current buffer fullness (bits available to be used).*/
-  ogg_int64_t        fullness;
-  /*The target buffer fullness.
-    This is where we'd like to be by the last keyframe the appears in the next
-     buf_delay frames.*/
-  ogg_int64_t        target;
-  /*The maximum buffer fullness (total size of the buffer).*/
-  ogg_int64_t        max;
-  /*The log of the number of pixels in a frame in Q57 format.*/
-  ogg_int64_t        log_npixels;
-  /*The exponent used in the rate model in Q8 format.*/
-  unsigned           exp[2];
-  /*The number of frames to distribute the buffer usage over.*/
-  int                buf_delay;
-  /*The total drop count from the previous frame.
-    This includes duplicates explicitly requested via the
-     TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
-  ogg_uint32_t       prev_drop_count;
-  /*The log of an estimated scale factor used to obtain the real framerate, for
-     VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
-  ogg_int64_t        log_drop_scale;
-  /*The log of estimated scale factor for the rate model in Q57 format.*/
-  ogg_int64_t        log_scale[2];
-  /*The log of the target quantizer level in Q57 format.*/
-  ogg_int64_t        log_qtarget;
-  /*Will we drop frames to meet bitrate target?*/
-  unsigned char      drop_frames;
-  /*Do we respect the maximum buffer fullness?*/
-  unsigned char      cap_overflow;
-  /*Can the reservoir go negative?*/
-  unsigned char      cap_underflow;
-  /*Second-order lowpass filters to track scale and VFR.*/
-  oc_iir_filter      scalefilter[2];
-  int                inter_count;
-  int                inter_delay;
-  int                inter_delay_target;
-  oc_iir_filter      vfrfilter;
-  /*Two-pass mode state.
-    0 => 1-pass encoding.
-    1 => 1st pass of 2-pass encoding.
-    2 => 2nd pass of 2-pass encoding.*/
-  int                twopass;
-  /*Buffer for current frame metrics.*/
-  unsigned char      twopass_buffer[48];
-  /*The number of bytes in the frame metrics buffer.
-    When 2-pass encoding is enabled, this is set to 0 after each frame is
-     submitted, and must be non-zero before the next frame will be accepted.*/
-  int                twopass_buffer_bytes;
-  int                twopass_buffer_fill;
-  /*Whether or not to force the next frame to be a keyframe.*/
-  unsigned char      twopass_force_kf;
-  /*The metrics for the previous frame.*/
-  oc_frame_metrics   prev_metrics;
-  /*The metrics for the current frame.*/
-  oc_frame_metrics   cur_metrics;
-  /*The buffered metrics for future frames.*/
-  oc_frame_metrics  *frame_metrics;
-  int                nframe_metrics;
-  int                cframe_metrics;
-  /*The index of the current frame in the circular metric buffer.*/
-  int                frame_metrics_head;
-  /*The frame count of each type (keyframes, delta frames, and dup frames);
-     32 bits limits us to 2.268 years at 60 fps.*/
-  ogg_uint32_t       frames_total[3];
-  /*The number of frames of each type yet to be processed.*/
-  ogg_uint32_t       frames_left[3];
-  /*The sum of the scale values for each frame type.*/
-  ogg_int64_t        scale_sum[2];
-  /*The start of the window over which the current scale sums are taken.*/
-  int                scale_window0;
-  /*The end of the window over which the current scale sums are taken.*/
-  int                scale_window_end;
-  /*The frame count of each type in the current 2-pass window; this does not
-     include dup frames.*/
-  int                nframes[3];
-  /*The total accumulated estimation bias.*/
-  ogg_int64_t        rate_bias;
-};
-
-
-void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc);
-void oc_rc_state_clear(oc_rc_state *_rc);
-
-void oc_enc_rc_resize(oc_enc_ctx *_enc);
-int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
-void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
-int oc_enc_update_rc_state(oc_enc_ctx *_enc,
- long _bits,int _qti,int _qi,int _trial,int _droppable);
-int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf);
-int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes);
-
-
-
-/*The internal encoder state.*/
-struct th_enc_ctx{
-  /*Shared encoder/decoder state.*/
-  oc_theora_state          state;
-  /*Buffer in which to assemble packets.*/
-  oggpack_buffer           opb;
-  /*Encoder-specific macroblock information.*/
-  oc_mb_enc_info          *mb_info;
-  /*DC coefficients after prediction.*/
-  ogg_int16_t             *frag_dc;
-  /*The list of coded macro blocks, in coded order.*/
-  unsigned                *coded_mbis;
-  /*The number of coded macro blocks.*/
-  size_t                   ncoded_mbis;
-  /*Whether or not packets are ready to be emitted.
-    This takes on negative values while there are remaining header packets to
-     be emitted, reaches 0 when the codec is ready for input, and becomes
-     positive when a frame has been processed and data packets are ready.*/
-  int                      packet_state;
-  /*The maximum distance between keyframes.*/
-  ogg_uint32_t             keyframe_frequency_force;
-  /*The number of duplicates to produce for the next frame.*/
-  ogg_uint32_t             dup_count;
-  /*The number of duplicates remaining to be emitted for the current frame.*/
-  ogg_uint32_t             nqueued_dups;
-  /*The number of duplicates emitted for the last frame.*/
-  ogg_uint32_t             prev_dup_count;
-  /*The current speed level.*/
-  int                      sp_level;
-  /*Whether or not VP3 compatibility mode has been enabled.*/
-  unsigned char            vp3_compatible;
-  /*Whether or not any INTER frames have been coded.*/
-  unsigned char            coded_inter_frame;
-  /*Whether or not previous frame was dropped.*/
-  unsigned char            prevframe_dropped;
-  /*Stores most recently chosen Huffman tables for each frame type, DC and AC
-     coefficients, and luma and chroma tokens.
-    The actual Huffman table used for a given coefficient depends not only on
-     the choice made here, but also its index in the zig-zag ordering.*/
-  unsigned char            huff_idxs[2][2][2];
-  /*Current count of bits used by each MV coding mode.*/
-  size_t                   mv_bits[2];
-  /*The mode scheme chooser for estimating mode coding costs.*/
-  oc_mode_scheme_chooser   chooser;
-  /*The number of vertical super blocks in an MCU.*/
-  int                      mcu_nvsbs;
-  /*The SSD error for skipping each fragment in the current MCU.*/
-  unsigned                *mcu_skip_ssd;
-  /*The DCT token lists for each coefficient and each plane.*/
-  unsigned char          **dct_tokens[3];
-  /*The extra bits associated with each DCT token.*/
-  ogg_uint16_t           **extra_bits[3];
-  /*The number of DCT tokens for each coefficient for each plane.*/
-  ptrdiff_t                ndct_tokens[3][64];
-  /*Pending EOB runs for each coefficient for each plane.*/
-  ogg_uint16_t             eob_run[3][64];
-  /*The offset of the first DCT token for each coefficient for each plane.*/
-  unsigned char            dct_token_offs[3][64];
-  /*The last DC coefficient for each plane and reference frame.*/
-  int                      dc_pred_last[3][3];
-#if defined(OC_COLLECT_METRICS)
-  /*Fragment SATD statistics for MB mode estimation metrics.*/
-  unsigned                *frag_satd;
-  /*Fragment SSD statistics for MB mode estimation metrics.*/
-  unsigned                *frag_ssd;
-#endif
-  /*The R-D optimization parameter.*/
-  int                      lambda;
-  /*The huffman tables in use.*/
-  th_huff_code             huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
-  /*The quantization parameters in use.*/
-  th_quant_info            qinfo;
-  oc_iquant               *enquant_tables[64][3][2];
-  oc_iquant_table          enquant_table_data[64][3][2];
-  /*An "average" quantizer for each quantizer type (INTRA or INTER) and qi
-     value.
-    This is used to paramterize the rate control decisions.
-    They are kept in the log domain to simplify later processing.
-    Keep in mind these are DCT domain quantizers, and so are scaled by an
-     additional factor of 4 from the pixel domain.*/
-  ogg_int64_t              log_qavg[2][64];
-  /*The buffer state used to drive rate control.*/
-  oc_rc_state              rc;
-  /*Table for encoder acceleration functions.*/
-  oc_enc_opt_vtable        opt_vtable;
-};
-
-
-void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
-int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
-#if defined(OC_COLLECT_METRICS)
-void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
-void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc);
-#endif
-
-
-
-/*Perform fullpel motion search for a single MB against both reference frames.*/
-void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi);
-/*Refine a MB MV for one frame.*/
-void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame);
-/*Refine the block MVs.*/
-void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi);
-
-
-
-/*Used to rollback a tokenlog transaction when we retroactively decide to skip
-   a fragment.
-  A checkpoint is taken right before each token is added.*/
-struct oc_token_checkpoint{
-  /*The color plane the token was added to.*/
-  unsigned char pli;
-  /*The zig-zag index the token was added to.*/
-  unsigned char zzi;
-  /*The outstanding EOB run count before the token was added.*/
-  ogg_uint16_t  eob_run;
-  /*The token count before the token was added.*/
-  ptrdiff_t     ndct_tokens;
-};
-
-
-
-void oc_enc_tokenize_start(oc_enc_ctx *_enc);
-int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
- int _zzi,oc_token_checkpoint **_stack,int _acmin);
-void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
- const oc_token_checkpoint *_stack,int _n);
-void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
- int _pli,int _fragy0,int _frag_yend);
-void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
- const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
- int _prev_ndct_tokens1,int _prev_eob_run1);
-void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
-
-
-
-/*Utility routine to encode one of the header packets.*/
-int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
- oggpack_buffer *_opb,const th_quant_info *_qinfo,
- const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
- const char *_vendor,th_comment *_tc,ogg_packet *_op);
-
-
-
-/*Encoder-specific accelerated functions.*/
-void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
- const unsigned char *_src,const unsigned char *_ref,int _ystride);
-void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
- const unsigned char *_src,int _ystride);
-unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_src,
- const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
- const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
- unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
-void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
- const ogg_int16_t _x[64]);
-
-/*Default pure-C implementations.*/
-void oc_enc_vtable_init_c(oc_enc_ctx *_enc);
-
-void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
- const unsigned char *_src,const unsigned char *_ref,int _ystride);
-void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
- const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2_c(unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-unsigned oc_enc_frag_sad_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride);
-void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-
-#endif
deleted file mode 100644
--- a/media/libtheora/lib/encoder_disabled.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: encoder_disabled.c 16503 2009-08-22 18:14:02Z giles $
-
- ********************************************************************/
-#include "apiwrapper.h"
-#include "encint.h"
-
-th_enc_ctx *th_encode_alloc(const th_info *_info){
-  return NULL;
-}
-
-void th_encode_free(th_enc_ctx *_enc){}
-
-
-int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
-  return OC_DISABLED;
-}
-
-int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
-  return OC_DISABLED;
-}
-
-int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-
-
-int theora_encode_init(theora_state *_te,theora_info *_ci){
-  return OC_DISABLED;
-}
-
-int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
-  return OC_DISABLED;
-}
-
-int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-int theora_encode_header(theora_state *_te,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-int theora_encode_tables(theora_state *_te,ogg_packet *_op){
-  return OC_DISABLED;
-}
deleted file mode 100644
--- a/media/libtheora/lib/enquant.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#if !defined(_enquant_H)
-# define _enquant_H (1)
-# include "quant.h"
-
-typedef struct oc_iquant oc_iquant;
-
-#define OC_QUANT_MAX_LOG (OC_Q57(OC_STATIC_ILOG_32(OC_QUANT_MAX)-1))
-
-/*Used to compute x/d via ((x*m>>16)+x>>l)+(x<0))
-   (i.e., one 16x16->16 mul, 2 shifts, and 2 adds).
-  This is not an approximation; for 16-bit x and d, it is exact.*/
-struct oc_iquant{
-  ogg_int16_t m;
-  ogg_int16_t l;
-};
-
-typedef oc_iquant        oc_iquant_table[64];
-
-
-
-void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
-void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
- oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo);
-void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
- ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
-
-#endif
--- a/media/libtheora/lib/fragment.c
+++ b/media/libtheora/lib/fragment.c
@@ -6,82 +6,77 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: fragment.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: fragment.c 17410 2010-09-21 21:53:48Z tterribe $
 
  ********************************************************************/
 #include <string.h>
 #include "internal.h"
 
-void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride){
-  (*_state->opt_vtable.frag_copy)(_dst,_src,_ystride);
-}
-
 void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
   int i;
   for(i=8;i-->0;){
     memcpy(_dst,_src,8*sizeof(*_dst));
     _dst+=_ystride;
     _src+=_ystride;
   }
 }
 
-void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,
- int _ystride,const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_intra(_dst,_ystride,_residue);
+/*Copies the fragments specified by the lists of fragment indices from one
+   frame to another.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        A pointer to a list of fragment indices.
+  _nfragis:       The number of fragment indices to copy.
+  _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  for(fragii=0;fragii<_nfragis;fragii++){
+    ptrdiff_t frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+    oc_frag_copy_c(_dst_frame+frag_buf_off,
+     _src_frame+frag_buf_off,_ystride);
+  }
 }
 
 void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
  const ogg_int16_t _residue[64]){
   int i;
   for(i=0;i<8;i++){
     int j;
     for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+128);
     _dst+=_ystride;
   }
 }
 
-void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_inter(_dst,_src,_ystride,_residue);
-}
-
 void oc_frag_recon_inter_c(unsigned char *_dst,
  const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
   int i;
   for(i=0;i<8;i++){
     int j;
     for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+_src[j]);
     _dst+=_ystride;
     _src+=_ystride;
   }
 }
 
-void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride,
- const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_inter2(_dst,_src1,_src2,_ystride,_residue);
-}
-
 void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
   int i;
   for(i=0;i<8;i++){
     int j;
     for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+(_src1[j]+_src2[j]>>1));
     _dst+=_ystride;
     _src1+=_ystride;
     _src2+=_ystride;
   }
 }
 
-void oc_restore_fpu(const oc_theora_state *_state){
-  _state->opt_vtable.restore_fpu();
-}
-
 void oc_restore_fpu_c(void){}
--- a/media/libtheora/lib/huffdec.c
+++ b/media/libtheora/lib/huffdec.c
@@ -6,35 +6,81 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: huffdec.c 16702 2009-11-15 00:40:55Z tterribe $
+    last mod: $Id: huffdec.c 17577 2010-10-29 04:00:07Z tterribe $
 
  ********************************************************************/
 
 #include <stdlib.h>
 #include <string.h>
 #include <ogg/ogg.h>
 #include "huffdec.h"
 #include "decint.h"
 
 
-/*The ANSI offsetof macro is broken on some platforms (e.g., older DECs).*/
-#define _ogg_offsetof(_type,_field)\
- ((size_t)((char *)&((_type *)0)->_field-(char *)0))
+
+/*Instead of storing every branching in the tree, subtrees can be collapsed
+   into one node, with a table of size 1<<nbits pointing directly to its
+   descedents nbits levels down.
+  This allows more than one bit to be read at a time, and avoids following all
+   the intermediate branches with next to no increased code complexity once
+   the collapsed tree has been built.
+  We do _not_ require that a subtree be complete to be collapsed, but instead
+   store duplicate pointers in the table, and record the actual depth of the
+   node below its parent.
+  This tells us the number of bits to advance the stream after reaching it.
+
+  This turns out to be equivalent to the method described in \cite{Hash95},
+   without the requirement that codewords be sorted by length.
+  If the codewords were sorted by length (so-called ``canonical-codes''), they
+   could be decoded much faster via either Lindell and Moffat's approach or
+   Hashemian's Condensed Huffman Code approach, the latter of which has an
+   extremely small memory footprint.
+  We can't use Choueka et al.'s finite state machine approach, which is
+   extremely fast, because we can't allow multiple symbols to be output at a
+   time; the codebook can and does change between symbols.
+  It also has very large memory requirements, which impairs cache coherency.
 
-/*The number of internal tokens associated with each of the spec tokens.*/
-static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={
-  1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8
-};
+  We store the tree packed in an array of 16-bit integers (words).
+  Each node consists of a single word, followed consecutively by two or more
+   indices of its children.
+  Let n be the value of this first word.
+  This is the number of bits that need to be read to traverse the node, and
+   must be positive.
+  1<<n entries follow in the array, each an index to a child node.
+  If the child is positive, then it is the index of another internal node in
+   the table.
+  If the child is negative or zero, then it is a leaf node.
+  These are stored directly in the child pointer to save space, since they only
+   require a single word.
+  If a leaf node would have been encountered before reading n bits, then it is
+   duplicated the necessary number of times in this table.
+  Leaf nodes pack both a token value and their actual depth in the tree.
+  The token in the leaf node is (-leaf&255).
+  The number of bits that need to be consumed to reach the leaf, starting from
+   the current node, is (-leaf>>8).
+
+  @ARTICLE{Hash95,
+    author="Reza Hashemian",
+    title="Memory Efficient and High-Speed Search {Huffman} Coding",
+    journal="{IEEE} Transactions on Communications",
+    volume=43,
+    number=10,
+    pages="2576--2581",
+    month=Oct,
+    year=1995
+  }*/
+
+
 
 /*The map from external spec-defined tokens to internal tokens.
   This is constructed so that any extra bits read with the original token value
    can be masked off the least significant bits of its internal token index.
   In addition, all of the tokens which require additional extra bits are placed
    at the start of the list, and grouped by type.
   OC_DCT_REPEAT_RUN3_TOKEN is placed first, as it is an extra-special case, so
    giving it index 0 may simplify comparisons on some architectures.
@@ -94,396 +140,376 @@ static const unsigned char OC_DCT_TOKEN_
   /*OC_DCT_RUN_CAT1C (4 extra bits)*/
   12,
   /*OC_DCT_RUN_CAT2A (2 extra bits)*/
   28,
   /*OC_DCT_RUN_CAT2B (3 extra bits)*/
   40
 };
 
-/*These three functions are really part of the bitpack.c module, but
-   they are only used here.
-  Declaring local static versions so they can be inlined saves considerable
-   function call overhead.*/
-
-static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
-  const unsigned char *ptr;
-  const unsigned char *stop;
-  oc_pb_window         window;
-  int                  available;
-  window=_b->window;
-  available=_b->bits;
-  ptr=_b->ptr;
-  stop=_b->stop;
-  /*This version of _refill() doesn't bother setting eof because we won't
-     check for it after we've started decoding DCT tokens.*/
-  if(ptr>=stop)available=OC_LOTS_OF_BITS;
-  while(available<=OC_PB_WINDOW_SIZE-8){
-    available+=8;
-    window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
-    if(ptr>=stop)available=OC_LOTS_OF_BITS;
-  }
-  _b->ptr=ptr;
-  if(_bits>available)window|=*ptr>>(available&7);
-  _b->bits=available;
-  return window;
-}
-
-
-/*Read in bits without advancing the bit pointer.
-  Here we assume 0<=_bits&&_bits<=32.*/
-static long oc_pack_look(oc_pack_buf *_b,int _bits){
-  oc_pb_window window;
-  int          available;
-  long         result;
-  window=_b->window;
-  available=_b->bits;
-  if(_bits==0)return 0;
-  if(_bits>available)_b->window=window=oc_pack_refill(_b,_bits);
-  result=window>>OC_PB_WINDOW_SIZE-_bits;
-  return result;
-}
-
-/*Advance the bit pointer.*/
-static void oc_pack_adv(oc_pack_buf *_b,int _bits){
-  /*We ignore the special cases for _bits==0 and _bits==32 here, since they are
-     never used actually used.
-    OC_HUFF_SLUSH (defined below) would have to be at least 27 to actually read
-     32 bits in a single go, and would require a 32 GB lookup table (assuming
-     8 byte pointers, since 4 byte pointers couldn't fit such a table).*/
-  _b->window<<=_bits;
-  _b->bits-=_bits;
-}
+/*The log base 2 of number of internal tokens associated with each of the spec
+   tokens (i.e., how many of the extra bits are folded into the token value).
+  Increasing the maximum value beyond 3 will enlarge the amount of stack
+   required for tree construction.*/
+static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={
+  0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3
+};
 
 
-/*The log_2 of the size of a lookup table is allowed to grow to relative to
-   the number of unique nodes it contains.
-  E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is
-   wasted (each node will have an amortized cost of at most 20 bytes when using
-   4-byte pointers).
+/*The size a lookup table is allowed to grow to relative to the number of
+   unique nodes it contains.
+  E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is
+   wasted (1/4 of the space must be used).
   Larger numbers can decode tokens with fewer read operations, while smaller
-   numbers may save more space (requiring as little as 8 bytes amortized per
-   node, though there will be more nodes).
+   numbers may save more space.
   With a sample file:
   32233473 read calls are required when no tree collapsing is done (100.0%).
-  19269269 read calls are required when OC_HUFF_SLUSH is 0 (59.8%).
-  11144969 read calls are required when OC_HUFF_SLUSH is 1 (34.6%).
-  10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%).
-  10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%).
-  Since a value of 1 gets us the vast majority of the speed-up with only a
-   small amount of wasted memory, this is what we use.*/
-#define OC_HUFF_SLUSH (1)
-
-
-/*Determines the size in bytes of a Huffman tree node that represents a
-   subtree of depth _nbits.
-  _nbits: The depth of the subtree.
-          If this is 0, the node is a leaf node.
-          Otherwise 1<<_nbits pointers are allocated for children.
-  Return: The number of bytes required to store the node.*/
-static size_t oc_huff_node_size(int _nbits){
-  size_t size;
-  size=_ogg_offsetof(oc_huff_node,nodes);
-  if(_nbits>0)size+=sizeof(oc_huff_node *)*(1<<_nbits);
-  return size;
-}
+  19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%).
+  11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%).
+  10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%).
+  10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%).
+  Since a value of 2 gets us the vast majority of the speed-up with only a
+   small amount of wasted memory, this is what we use.
+  This value must be less than 128, or you could create a tree with more than
+   32767 entries, which would overflow the 16-bit words used to index it.*/
+#define OC_HUFF_SLUSH (2)
+/*The root of the tree is on the fast path, and a larger value here is more
+   beneficial than elsewhere in the tree.
+  7 appears to give the best performance, trading off between increased use of
+   the single-read fast path and cache footprint for the tables, though
+   obviously this will depend on your cache size.
+  Using 7 here, the VP3 tables are about twice as large compared to using 2.*/
+#define OC_ROOT_HUFF_SLUSH (7)
 
-static oc_huff_node *oc_huff_node_init(char **_storage,size_t _size,int _nbits){
-  oc_huff_node *ret;
-  ret=(oc_huff_node *)*_storage;
-  ret->nbits=(unsigned char)_nbits;
-  (*_storage)+=_size;
-  return ret;
-}
-
-
-/*Determines the size in bytes of a Huffman tree.
-  _nbits: The depth of the subtree.
-          If this is 0, the node is a leaf node.
-          Otherwise storage for 1<<_nbits pointers are added for children.
-  Return: The number of bytes required to store the tree.*/
-static size_t oc_huff_tree_size(const oc_huff_node *_node){
-  size_t size;
-  size=oc_huff_node_size(_node->nbits);
-  if(_node->nbits){
-    int nchildren;
-    int i;
-    nchildren=1<<_node->nbits;
-    for(i=0;i<nchildren;i+=1<<_node->nbits-_node->nodes[i]->depth){
-      size+=oc_huff_tree_size(_node->nodes[i]);
-    }
-  }
-  return size;
-}
 
 
-/*Unpacks a sub-tree from the given buffer.
-  _opb:      The buffer to unpack from.
-  _binodes:  The nodes to store the sub-tree in.
-  _nbinodes: The number of nodes available for the sub-tree.
-  Return: 0 on success, or a negative value on error.*/
-static int oc_huff_tree_unpack(oc_pack_buf *_opb,
- oc_huff_node *_binodes,int _nbinodes){
-  oc_huff_node *binode;
-  long          bits;
-  int           nused;
-  if(_nbinodes<1)return TH_EBADHEADER;
-  binode=_binodes;
-  nused=0;
-  bits=oc_pack_read1(_opb);
-  if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
-  /*Read an internal node:*/
-  if(!bits){
-    int ret;
-    nused++;
-    binode->nbits=1;
-    binode->depth=1;
-    binode->nodes[0]=_binodes+nused;
-    ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
-    if(ret>=0){
-      nused+=ret;
-      binode->nodes[1]=_binodes+nused;
-      ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
+/*Unpacks a Huffman codebook.
+  _opb:    The buffer to unpack from.
+  _tokens: Stores a list of internal tokens, in the order they were found in
+            the codebook, and the lengths of their corresponding codewords.
+           This is enough to completely define the codebook, while minimizing
+            stack usage and avoiding temporary allocations (for platforms
+            where free() is a no-op).
+  Return: The number of internal tokens in the codebook, or a negative value
+   on error.*/
+int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){
+  ogg_uint32_t code;
+  int          len;
+  int          ntokens;
+  int          nleaves;
+  code=0;
+  len=ntokens=nleaves=0;
+  for(;;){
+    long bits;
+    bits=oc_pack_read1(_opb);
+    /*Only process nodes so long as there's more bits in the buffer.*/
+    if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+    /*Read an internal node:*/
+    if(!bits){
+      len++;
+      /*Don't allow codewords longer than 32 bits.*/
+      if(len>32)return TH_EBADHEADER;
     }
-    if(ret<0)return ret;
-    nused+=ret;
-  }
-  /*Read a leaf node:*/
-  else{
-    int ntokens;
-    int token;
-    int i;
-    bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
-    if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
-    /*Find out how many internal tokens we translate this external token into.*/
-    ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits];
-    if(_nbinodes<2*ntokens-1)return TH_EBADHEADER;
-    /*Fill in a complete binary tree pointing to the internal tokens.*/
-    for(i=1;i<ntokens;i<<=1){
-      int j;
-      binode=_binodes+nused;
-      nused+=i;
-      for(j=0;j<i;j++){
-        binode[j].nbits=1;
-        binode[j].depth=1;
-        binode[j].nodes[0]=_binodes+nused+2*j;
-        binode[j].nodes[1]=_binodes+nused+2*j+1;
+    /*Read a leaf node:*/
+    else{
+      ogg_uint32_t code_bit;
+      int          neb;
+      int          nentries;
+      int          token;
+      /*Don't allow more than 32 spec-tokens per codebook.*/
+      if(++nleaves>32)return TH_EBADHEADER;
+      bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
+      neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits];
+      token=OC_DCT_TOKEN_MAP[bits];
+      nentries=1<<neb;
+      while(nentries-->0){
+        _tokens[ntokens][0]=(unsigned char)token++;
+        _tokens[ntokens][1]=(unsigned char)(len+neb);
+        ntokens++;
       }
-    }
-    /*And now the leaf nodes with those tokens.*/
-    token=OC_DCT_TOKEN_MAP[bits];
-    for(i=0;i<ntokens;i++){
-      binode=_binodes+nused++;
-      binode->nbits=0;
-      binode->depth=1;
-      binode->token=token+i;
+      code_bit=0x80000000U>>len-1;
+      while(len>0&&(code&code_bit)){
+        code^=code_bit;
+        code_bit<<=1;
+        len--;
+      }
+      if(len<=0)break;
+      code|=code_bit;
     }
   }
-  return nused;
+  return ntokens;
 }
 
-/*Finds the depth of shortest branch of the given sub-tree.
-  The tree must be binary.
-  _binode: The root of the given sub-tree.
-           _binode->nbits must be 0 or 1.
-  Return: The smallest depth of a leaf node in this sub-tree.
-          0 indicates this sub-tree is a leaf node.*/
-static int oc_huff_tree_mindepth(oc_huff_node *_binode){
-  int depth0;
-  int depth1;
-  if(_binode->nbits==0)return 0;
-  depth0=oc_huff_tree_mindepth(_binode->nodes[0]);
-  depth1=oc_huff_tree_mindepth(_binode->nodes[1]);
-  return OC_MINI(depth0,depth1)+1;
-}
-
-/*Finds the number of internal nodes at a given depth, plus the number of
-   leaves at that depth or shallower.
-  The tree must be binary.
-  _binode: The root of the given sub-tree.
-           _binode->nbits must be 0 or 1.
-  Return: The number of entries that would be contained in a jump table of the
-           given depth.*/
-static int oc_huff_tree_occupancy(oc_huff_node *_binode,int _depth){
-  if(_binode->nbits==0||_depth<=0)return 1;
-  else{
-    return oc_huff_tree_occupancy(_binode->nodes[0],_depth-1)+
-     oc_huff_tree_occupancy(_binode->nodes[1],_depth-1);
+/*Count how many tokens would be required to fill a subtree at depth _depth.
+  _tokens: A list of internal tokens, in the order they are found in the
+            codebook, and the lengths of their corresponding codewords.
+  _depth:  The depth of the desired node in the corresponding tree structure.
+  Return: The number of tokens that belong to that subtree.*/
+static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){
+  ogg_uint32_t code;
+  int          ti;
+  code=0;
+  ti=0;
+  do{
+    if(_tokens[ti][1]-_depth<32)code+=0x80000000U>>_tokens[ti++][1]-_depth;
+    else{
+      /*Because of the expanded internal tokens, we can have codewords as long
+         as 35 bits.
+        A single recursion here is enough to advance past them.*/
+      code++;
+      ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+31);
+    }
   }
+  while(code<0x80000000U);
+  return ti;
 }
 
-/*Makes a copy of the given Huffman tree.
-  _node: The Huffman tree to copy.
-  Return: The copy of the Huffman tree.*/
-static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node,
- char **_storage){
-  oc_huff_node *ret;
-  ret=oc_huff_node_init(_storage,oc_huff_node_size(_node->nbits),_node->nbits);
-  ret->depth=_node->depth;
-  if(_node->nbits){
-    int nchildren;
-    int i;
-    int inext;
-    nchildren=1<<_node->nbits;
-    for(i=0;i<nchildren;){
-      ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i],_storage);
-      inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
-      while(++i<inext)ret->nodes[i]=ret->nodes[i-1];
+/*Compute the number of bits to use for a collapsed tree node at the given
+   depth.
+  _tokens:  A list of internal tokens, in the order they are found in the
+             codebook, and the lengths of their corresponding codewords.
+  _ntokens: The number of tokens corresponding to this tree node.
+  _depth:   The depth of this tree node.
+  Return: The number of bits to use for a collapsed tree node rooted here.
+          This is always at least one, even if this was a leaf node.*/
+static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2],
+ int _ntokens,int _depth){
+  int got_leaves;
+  int loccupancy;
+  int occupancy;
+  int slush;
+  int nbits;
+  int best_nbits;
+  slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH;
+  /*It's legal to have a tree with just a single node, which requires no bits
+     to decode and always returns the same token.
+    However, no encoder actually does this (yet).
+    To avoid a special case in oc_huff_token_decode(), we force the number of
+     lookahead bits to be at least one.
+    This will produce a tree that looks ahead one bit and then advances the
+     stream zero bits.*/
+  nbits=1;
+  occupancy=2;
+  got_leaves=1;
+  do{
+    int ti;
+    if(got_leaves)best_nbits=nbits;
+    nbits++;
+    got_leaves=0;
+    loccupancy=occupancy;
+    for(occupancy=ti=0;ti<_ntokens;occupancy++){
+      if(_tokens[ti][1]<_depth+nbits)ti++;
+      else if(_tokens[ti][1]==_depth+nbits){
+        got_leaves=1;
+        ti++;
+      }
+      else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits);
     }
   }
-  else ret->token=_node->token;
-  return ret;
+  while(occupancy>loccupancy&&occupancy*slush>=1<<nbits);
+  return best_nbits;
 }
 
-static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){
-  size_t size;
-  int    mindepth;
-  int    depth;
-  int    loccupancy;
-  int    occupancy;
-  if(_binode->nbits!=0&&_depth>0){
-    return oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1)+
-     oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1);
-  }
-  depth=mindepth=oc_huff_tree_mindepth(_binode);
-  occupancy=1<<mindepth;
-  do{
-    loccupancy=occupancy;
-    occupancy=oc_huff_tree_occupancy(_binode,++depth);
-  }
-  while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
-  depth--;
-  size=oc_huff_node_size(depth);
-  if(depth>0){
-    size+=oc_huff_tree_collapse_size(_binode->nodes[0],depth-1);
-    size+=oc_huff_tree_collapse_size(_binode->nodes[1],depth-1);
-  }
-  return size;
+/*Determines the size in words of a Huffman tree node that represents a
+   subtree of depth _nbits.
+  _nbits: The depth of the subtree.
+          This must be greater than zero.
+  Return: The number of words required to store the node.*/
+static size_t oc_huff_node_size(int _nbits){
+  return 1+(1<<_nbits);
 }
 
-static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
- char **_storage);
-
-/*Fills the given nodes table with all the children in the sub-tree at the
-   given depth.
-  The nodes in the sub-tree with a depth less than that stored in the table
-   are freed.
-  The sub-tree must be binary and complete up until the given depth.
-  _nodes:  The nodes table to fill.
-  _binode: The root of the sub-tree to fill it with.
-           _binode->nbits must be 0 or 1.
-  _level:  The current level in the table.
-           0 indicates that the current node should be stored, regardless of
-            whether it is a leaf node or an internal node.
-  _depth:  The depth of the nodes to fill the table with, relative to their
-            parent.*/
-static void oc_huff_node_fill(oc_huff_node **_nodes,
- oc_huff_node *_binode,int _level,int _depth,char **_storage){
-  if(_level<=0||_binode->nbits==0){
-    int i;
-    _binode->depth=(unsigned char)(_depth-_level);
-    _nodes[0]=oc_huff_tree_collapse(_binode,_storage);
-    for(i=1;i<1<<_level;i++)_nodes[i]=_nodes[0];
-  }
-  else{
-    _level--;
-    oc_huff_node_fill(_nodes,_binode->nodes[0],_level,_depth,_storage);
-    _nodes+=1<<_level;
-    oc_huff_node_fill(_nodes,_binode->nodes[1],_level,_depth,_storage);
+/*Produces a collapsed-tree representation of the given token list.
+  _tree: The storage for the collapsed Huffman tree.
+         This may be NULL to compute the required storage size instead of
+          constructing the tree.
+  _tokens:  A list of internal tokens, in the order they are found in the
+             codebook, and the lengths of their corresponding codewords.
+  _ntokens: The number of tokens corresponding to this tree node.
+  Return: The number of words required to store the tree.*/
+static size_t oc_huff_tree_collapse(ogg_int16_t *_tree,
+ unsigned char _tokens[][2],int _ntokens){
+  ogg_int16_t   node[34];
+  unsigned char depth[34];
+  unsigned char last[34];
+  size_t        ntree;
+  int           ti;
+  int           l;
+  depth[0]=0;
+  last[0]=(unsigned char)(_ntokens-1);
+  ntree=0;
+  ti=0;
+  l=0;
+  do{
+    int nbits;
+    nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]);
+    node[l]=(ogg_int16_t)ntree;
+    ntree+=oc_huff_node_size(nbits);
+    if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits;
+    do{
+      while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){
+        if(_tree!=NULL){
+          ogg_int16_t leaf;
+          int         nentries;
+          nentries=1<<depth[l]+nbits-_tokens[ti][1];
+          leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]);
+          while(nentries-->0)_tree[node[l]++]=leaf;
+        }
+        ti++;
+      }
+      if(ti<=last[l]){
+        /*We need to recurse*/
+        depth[l+1]=(unsigned char)(depth[l]+nbits);
+        if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree;
+        l++;
+        last[l]=
+         (unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1);
+        break;
+      }
+      /*Pop back up a level of recursion.*/
+      else if(l-->0)nbits=depth[l+1]-depth[l];
+    }
+    while(l>=0);
   }
-}
-
-/*Finds the largest complete sub-tree rooted at the current node and collapses
-   it into a single node.
-  This procedure is then applied recursively to all the children of that node.
-  _binode: The root of the sub-tree to collapse.
-           _binode->nbits must be 0 or 1.
-  Return: The new root of the collapsed sub-tree.*/
-static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
- char **_storage){
-  oc_huff_node *root;
-  size_t        size;
-  int           mindepth;
-  int           depth;
-  int           loccupancy;
-  int           occupancy;
-  depth=mindepth=oc_huff_tree_mindepth(_binode);
-  occupancy=1<<mindepth;
-  do{
-    loccupancy=occupancy;
-    occupancy=oc_huff_tree_occupancy(_binode,++depth);
-  }
-  while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
-  depth--;
-  if(depth<=0)return oc_huff_tree_copy(_binode,_storage);
-  size=oc_huff_node_size(depth);
-  root=oc_huff_node_init(_storage,size,depth);
-  root->depth=_binode->depth;
-  oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage);
-  return root;
+  while(l>=0);
+  return ntree;
 }
 
 /*Unpacks a set of Huffman trees, and reduces them to a collapsed
    representation.
   _opb:   The buffer to unpack the trees from.
   _nodes: The table to fill with the Huffman trees.
-  Return: 0 on success, or a negative value on error.*/
+  Return: 0 on success, or a negative value on error.
+          The caller is responsible for cleaning up any partially initialized
+           _nodes on failure.*/
 int oc_huff_trees_unpack(oc_pack_buf *_opb,
- oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+ ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
   int i;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++){
-    oc_huff_node  nodes[511];
-    char         *storage;
-    size_t        size;
-    int           ret;
+    unsigned char  tokens[256][2];
+    int            ntokens;
+    ogg_int16_t   *tree;
+    size_t         size;
     /*Unpack the full tree into a temporary buffer.*/
-    ret=oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes));
-    if(ret<0)return ret;
-    /*Figure out how big the collapsed tree will be.*/
-    size=oc_huff_tree_collapse_size(nodes,0);
-    storage=(char *)_ogg_calloc(1,size);
-    if(storage==NULL)return TH_EFAULT;
-    /*And collapse it.*/
-    _nodes[i]=oc_huff_tree_collapse(nodes,&storage);
+    ntokens=oc_huff_tree_unpack(_opb,tokens);
+    if(ntokens<0)return ntokens;
+    /*Figure out how big the collapsed tree will be and allocate space for it.*/
+    size=oc_huff_tree_collapse(NULL,tokens,ntokens);
+    /*This should never happen; if it does it means you set OC_HUFF_SLUSH or
+       OC_ROOT_HUFF_SLUSH too large.*/
+    if(size>32767)return TH_EIMPL;
+    tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
+    if(tree==NULL)return TH_EFAULT;
+    /*Construct the collapsed the tree.*/
+    oc_huff_tree_collapse(tree,tokens,ntokens);
+    _nodes[i]=tree;
   }
   return 0;
 }
 
+/*Determines the size in words of a Huffman subtree.
+  _tree: The complete Huffman tree.
+  _node: The index of the root of the desired subtree.
+  Return: The number of words required to store the tree.*/
+static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){
+  size_t size;
+  int    nchildren;
+  int    n;
+  int    i;
+  n=_tree[_node];
+  size=oc_huff_node_size(n);
+  nchildren=1<<n;
+  i=0;
+  do{
+    int child;
+    child=_tree[_node+i+1];
+    if(child<=0)i+=1<<n-(-child>>8);
+    else{
+      size+=oc_huff_tree_size(_tree,child);
+      i++;
+    }
+  }
+  while(i<nchildren);
+  return size;
+}
+
 /*Makes a copy of the given set of Huffman trees.
   _dst: The array to store the copy in.
   _src: The array of trees to copy.*/
-int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
+int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
+ const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){
+  int total;
   int i;
+  total=0;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++){
-    size_t  size;
-    char   *storage;
-    size=oc_huff_tree_size(_src[i]);
-    storage=(char *)_ogg_calloc(1,size);
-    if(storage==NULL){
+    size_t size;
+    size=oc_huff_tree_size(_src[i],0);
+    total+=size;
+    _dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i]));
+    if(_dst[i]==NULL){
       while(i-->0)_ogg_free(_dst[i]);
       return TH_EFAULT;
     }
-    _dst[i]=oc_huff_tree_copy(_src[i],&storage);
+    memcpy(_dst[i],_src[i],size*sizeof(*_dst[i]));
   }
   return 0;
 }
 
 /*Frees the memory used by a set of Huffman trees.
   _nodes: The array of trees to free.*/
-void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
   int i;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]);
 }
 
+
 /*Unpacks a single token using the given Huffman tree.
   _opb:  The buffer to unpack the token from.
   _node: The tree to unpack the token with.
   Return: The token value.*/
-int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node){
-  long bits;
-  while(_node->nbits!=0){
-    bits=oc_pack_look(_opb,_node->nbits);
-    _node=_node->nodes[bits];
-    oc_pack_adv(_opb,_node->depth);
+int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+  const unsigned char *ptr;
+  const unsigned char *stop;
+  oc_pb_window         window;
+  int                  available;
+  long                 bits;
+  int                  node;
+  int                  n;
+  ptr=_opb->ptr;
+  window=_opb->window;
+  stop=_opb->stop;
+  available=_opb->bits;
+  node=0;
+  for(;;){
+    n=_tree[node];
+    if(n>available){
+      unsigned shift;
+      shift=OC_PB_WINDOW_SIZE-available;
+      do{
+        /*We don't bother setting eof because we won't check for it after we've
+           started decoding DCT tokens.*/
+        if(ptr>=stop){
+          shift=(unsigned)-OC_LOTS_OF_BITS;
+          break;
+        }
+        shift-=8;
+        window|=(oc_pb_window)*ptr++<<shift;
+      }
+      while(shift>=8);
+      /*Note: We never request more than 24 bits, so there's no need to fill in
+         the last partial byte here.*/
+      available=OC_PB_WINDOW_SIZE-shift;
+    }
+    bits=window>>OC_PB_WINDOW_SIZE-n;
+    node=_tree[node+1+bits];
+    if(node<=0)break;
+    window<<=n;
+    available-=n;
   }
-  return _node->token;
+  node=-node;
+  n=node>>8;
+  window<<=n;
+  available-=n;
+  _opb->ptr=ptr;
+  _opb->window=window;
+  _opb->bits=available;
+  return node&255;
 }
--- a/media/libtheora/lib/huffdec.h
+++ b/media/libtheora/lib/huffdec.h
@@ -6,87 +6,27 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: huffdec.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: huffdec.h 17410 2010-09-21 21:53:48Z tterribe $
 
  ********************************************************************/
 
 #if !defined(_huffdec_H)
 # define _huffdec_H (1)
 # include "huffman.h"
 # include "bitpack.h"
 
 
 
-typedef struct oc_huff_node oc_huff_node;
-
-/*A node in the Huffman tree.
-  Instead of storing every branching in the tree, subtrees can be collapsed
-   into one node, with a table of size 1<<nbits pointing directly to its
-   descedents nbits levels down.
-  This allows more than one bit to be read at a time, and avoids following all
-   the intermediate branches with next to no increased code complexity once
-   the collapsed tree has been built.
-  We do _not_ require that a subtree be complete to be collapsed, but instead
-   store duplicate pointers in the table, and record the actual depth of the
-   node below its parent.
-  This tells us the number of bits to advance the stream after reaching it.
-
-  This turns out to be equivalent to the method described in \cite{Hash95},
-   without the requirement that codewords be sorted by length.
-  If the codewords were sorted by length (so-called ``canonical-codes''), they
-   could be decoded much faster via either Lindell and Moffat's approach or
-   Hashemian's Condensed Huffman Code approach, the latter of which has an
-   extremely small memory footprint.
-  We can't use Choueka et al.'s finite state machine approach, which is
-   extremely fast, because we can't allow multiple symbols to be output at a
-   time; the codebook can and does change between symbols.
-  It also has very large memory requirements, which impairs cache coherency.
-
-  @ARTICLE{Hash95,
-    author="Reza Hashemian",
-    title="Memory Efficient and High-Speed Search {Huffman} Coding",
-    journal="{IEEE} Transactions on Communications",
-    volume=43,
-    number=10,
-    pages="2576--2581",
-    month=Oct,
-    year=1995
-  }*/
-struct oc_huff_node{
-  /*The number of bits of the code needed to descend through this node.
-    0 indicates a leaf node.
-    Otherwise there are 1<<nbits nodes in the nodes table, which can be
-     indexed by reading nbits bits from the stream.*/
-  unsigned char  nbits;
-  /*The value of a token stored in a leaf node.
-    The value in non-leaf nodes is undefined.*/
-  unsigned char  token;
-  /*The depth of the current node, relative to its parent in the collapsed
-     tree.
-    This can be less than its parent's nbits value, in which case there are
-     1<<nbits-depth copies of this node in the table, and the bitstream should
-     only be advanced depth bits after reaching this node.*/
-  unsigned char  depth;
-  /*The table of child nodes.
-    The ACTUAL size of this array is 1<<nbits, despite what the declaration
-     below claims.
-    The exception is that for leaf nodes the size is 0.*/
-  oc_huff_node  *nodes[2];
-};
-
-
-
 int oc_huff_trees_unpack(oc_pack_buf *_opb,
- oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
-void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node);
-
+ ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
+ const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);
+void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node);
 
 #endif
deleted file mode 100644
--- a/media/libtheora/lib/huffenc.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#if !defined(_huffenc_H)
-# define _huffenc_H (1)
-# include "huffman.h"
-
-
-
-typedef th_huff_code                  th_huff_table[TH_NDCT_TOKENS];
-
-
-
-extern const th_huff_code
- TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
-
-
-
-int oc_huff_codes_pack(oggpack_buffer *_opb,
- const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
-
-#endif
--- a/media/libtheora/lib/idct.c
+++ b/media/libtheora/lib/idct.c
@@ -6,17 +6,17 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: idct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: idct.c 17410 2010-09-21 21:53:48Z tterribe $
 
  ********************************************************************/
 
 #include <string.h>
 #include "internal.h"
 #include "dct.h"
 
 /*Performs an inverse 8 point Type-II DCT transform.
@@ -226,28 +226,28 @@ static void idct8_1(ogg_int16_t *_y,cons
    0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients.*/
-static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
   /*Transform rows of x into columns of w.*/
   idct8_2(w,_x);
   idct8_1(w+1,_x+8);
   /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
+  for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  /*Clear input data for next block (decoder only).*/
+  if(_x!=_y)_x[0]=_x[1]=_x[8]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.
   All coefficients but the first 10 in zig-zag scan order are assumed to be 0:
    x  x  x  x  0  0  0  0
    x  x  x  0  0  0  0  0
@@ -255,60 +255,54 @@ static void oc_idct8x8_3(ogg_int16_t _y[
    x  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients.*/
-static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
   /*Transform rows of x into columns of w.*/
   idct8_4(w,_x);
   idct8_3(w+1,_x+8);
   idct8_2(w+2,_x+16);
   idct8_1(w+3,_x+24);
   /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
+  for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  /*Clear input data for next block (decoder only).*/
+  if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients.*/
-static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
   /*Transform rows of x into columns of w.*/
-  for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  for(i=0;i<8;i++)idct8(w+i,_x+i*8);
   /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  for(i=0;i<8;i++)idct8(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
-}
-
-void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- int _last_zzi){
-  (*_state->opt_vtable.idct8x8)(_y,_last_zzi);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  if(_x!=_y)for(i=0;i<64;i++)_x[i]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
      decoded.
     In most cases this is an EOB token (the continuation of an EOB run from a
      previous block counts), and so this is the same as the coefficient count.
     However, in the case that the last token was NOT an EOB token, but filled
      the block up with exactly 64 coefficients, _last_zzi will be less than 64.
@@ -324,12 +318,12 @@ void oc_idct8x8_c(ogg_int16_t _y[64],int
     Although convoluted, this is arguably the correct behavior: it allows us to
      use a smaller transform when the block ends with a long zero run instead
      of a normal EOB token.
     It could be smarter... multiple separate zero runs at the end of a block
      will fool it, but an encoder that generates these really deserves what it
      gets.
     Needless to say we inherited this approach from VP3.*/
   /*Then perform the iDCT.*/
-  if(_last_zzi<3)oc_idct8x8_3(_y,_y);
-  else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
-  else oc_idct8x8_slow(_y,_y);
+  if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
+  else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+  else oc_idct8x8_slow(_y,_x);
 }
--- a/media/libtheora/lib/internal.c
+++ b/media/libtheora/lib/internal.c
@@ -6,17 +6,17 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: internal.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: internal.c 17506 2010-10-13 02:52:41Z tterribe $
 
  ********************************************************************/
 
 #include <stdlib.h>
 #include <limits.h>
 #include <string.h>
 #include "internal.h"
 
@@ -92,89 +92,39 @@ const unsigned char OC_DCT_TOKEN_EXTRA_B
 int oc_ilog(unsigned _v){
   int ret;
   for(ret=0;_v;ret++)_v>>=1;
   return ret;
 }
 
 
 
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the X and Y directions
-   (4:2:0).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[1][0]+_lbmvs[2][0]+_lbmvs[3][0];
-  dy=_lbmvs[0][1]+_lbmvs[1][1]+_lbmvs[2][1]+_lbmvs[3][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2);
-}
-
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the Y direction.
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[2][0];
-  dy=_lbmvs[0][1]+_lbmvs[2][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
-  dx=_lbmvs[1][0]+_lbmvs[3][0];
-  dy=_lbmvs[1][1]+_lbmvs[3][1];
-  _cbmvs[1][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[1][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+void *oc_aligned_malloc(size_t _sz,size_t _align){
+  unsigned char *p;
+  if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL;
+  p=(unsigned char *)_ogg_malloc(_sz+_align);
+  if(p!=NULL){
+    int offs;
+    offs=((p-(unsigned char *)0)-1&_align-1);
+    p[offs]=offs;
+    p+=offs+1;
+  }
+  return p;
 }
 
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the X direction (4:2:2).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[1][0];
-  dy=_lbmvs[0][1]+_lbmvs[1][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
-  dx=_lbmvs[2][0]+_lbmvs[3][0];
-  dy=_lbmvs[2][1]+_lbmvs[3][1];
-  _cbmvs[2][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[2][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+void oc_aligned_free(void *_ptr){
+  unsigned char *p;
+  p=(unsigned char *)_ptr;
+  if(p!=NULL){
+    int offs;
+    offs=*--p;
+    _ogg_free(p-offs);
+  }
 }
 
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with no chroma decimation (4:4:4).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lmbmv: The luma macro-block level motion vector to fill in for use in
-           prediction.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  memcpy(_cbmvs,_lbmvs,4*sizeof(_lbmvs[0]));
-}
-
-/*A table of functions used to fill in the chroma plane motion vectors for a
-   macro block when 4 different motion vectors are specified in the luma
-   plane.*/
-const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs11
-};
-
-
 
 void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
   size_t  rowsz;
   size_t  colsz;
   size_t  datsz;
   char   *ret;
   colsz=_height*sizeof(void *);
   rowsz=_sz*_width;
--- a/media/libtheora/lib/internal.h
+++ b/media/libtheora/lib/internal.h
@@ -6,397 +6,88 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
-    last mod: $Id: internal.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: internal.h 17578 2010-10-29 04:21:26Z tterribe $
 
  ********************************************************************/
 #if !defined(_internal_H)
 # define _internal_H (1)
 # include <stdlib.h>
 # include <limits.h>
 # if defined(HAVE_CONFIG_H)
-#  include <config.h>
+#  include "config.h"
 # endif
 # include "theora/codec.h"
 # include "theora/theora.h"
+# include "ocintrin.h"
+
+# if !defined(__GNUC_PREREQ)
+#  if defined(__GNUC__)&&defined(__GNUC_MINOR__)
+#   define __GNUC_PREREQ(_maj,_min) \
+ ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
+#  else
+#   define __GNUC_PREREQ(_maj,_min) 0
+#  endif
+# endif
 
 # if defined(_MSC_VER)
 /*Disable missing EMMS warnings.*/
 #  pragma warning(disable:4799)
 /*Thank you Microsoft, I know the order of operations.*/
 #  pragma warning(disable:4554)
 # endif
 /*You, too, gcc.*/
-# if defined(__GNUC_PREREQ)
-#  if __GNUC_PREREQ(4,2)
-#   pragma GCC diagnostic ignored "-Wparentheses"
-#  endif
+# if __GNUC_PREREQ(4,2)
+#  pragma GCC diagnostic ignored "-Wparentheses"
 # endif
 
-# include "ocintrin.h"
-# include "huffman.h"
-# include "quant.h"
-
-/*Some assembly constructs require aligned operands.*/
-# if defined(OC_X86_ASM)
+/*Some assembly constructs require aligned operands.
+  The following macros are _only_ intended for structure member declarations.
+  Although they will sometimes work on stack variables, gcc will often silently
+   ignore them.
+  A separate set of macros could be made for manual stack alignment, but we
+   don't actually require it anywhere.*/
+# if defined(OC_X86_ASM)||defined(OC_ARM_ASM)
 #  if defined(__GNUC__)
 #   define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
 #   define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
 #  elif defined(_MSC_VER)
 #   define OC_ALIGN8(expr) __declspec (align(8)) expr
 #   define OC_ALIGN16(expr) __declspec (align(16)) expr
+#  else
+#   error "Alignment macros required for this platform."
 #  endif
 # endif
 # if !defined(OC_ALIGN8)
 #  define OC_ALIGN8(expr) expr
 # endif
 # if !defined(OC_ALIGN16)
 #  define OC_ALIGN16(expr) expr
 # endif
 
 
 
-typedef struct oc_sb_flags              oc_sb_flags;
-typedef struct oc_border_info           oc_border_info;