Bug 498770 - Enable optimized Theora code in Windows builds - r=kinetik rs=roc
author: David Schleef <ds@schleef.org>
Fri, 19 Jun 2009 15:03:45 +1200
changeset 29340 240cad5e94b690dd035e86a15f436f44f8f3f244
parent 29339 517488f53cb81c6da0c97942e4ef725f8384020f
child 29341 c5e23f5c7e7bb8bf79e2e25a4e4b0eff1cbe9b05
push id: 7586
push user: cdouble@mozilla.com
push date: Fri, 19 Jun 2009 06:25:19 +0000
treeherder: mozilla-central@592ca0329827 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: kinetik, roc
bugs: 498770
milestone: 1.9.2a1pre
Bug 498770 - Enable optimized Theora code in Windows builds - r=kinetik rs=roc
media/libtheora/README_MOZILLA
media/libtheora/bug498770.patch
media/libtheora/lib/Makefile.in
media/libtheora/lib/dec/x86_vc/mmxfrag.c
media/libtheora/lib/dec/x86_vc/mmxloopfilter.c
media/libtheora/update.sh
--- a/media/libtheora/README_MOZILLA
+++ b/media/libtheora/README_MOZILLA
@@ -1,13 +1,14 @@
 The source from this directory was copied from the libtheora-1.0
 source distribution using the update.sh script. The changes made were
 those applied by update.sh, the addition/update of Makefile.in files
 for the Mozilla build system and the patch in bug below.
 
+bug498770.patch - Enable optimized Theora code in Windows builds
 Bug 455357 - WinCE LibTheora Pre-defined Macro usage in local variable
   455357_wince_local_variable_macro_clash_patch
   This patch is needed for building WinCE / WinMobile because the 
   Mozilla WinCE Shunt Library currently includes windows.h header file,
   which causes a conflict with local variables in the oc_dering_block()
   function.  This issue should be cleared up soon, with a reworking of 
   the WinCE Shunt Library (Bug 456788 - reduce windows ce shunt impact).  
   Until then, this simple patch allows WinCE to finish compiling.
new file mode 100644
--- /dev/null
+++ b/media/libtheora/bug498770.patch
@@ -0,0 +1,97 @@
+Index: lib/dec/x86_vc/mmxfrag.c
+===================================================================
+--- lib/dec/x86_vc/mmxfrag.c	(revision 16142)
++++ lib/dec/x86_vc/mmxfrag.c	(working copy)
+@@ -27,12 +27,14 @@
+ 
+ void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+  const ogg_int16_t *_residue){
++  int _save_ebx;
+   /* ---------------------------------------------------------------------
+   This function does the inter reconstruction step with 8 iterations
+   unrolled. The iteration for each instruction is noted by the #id in the
+   comments (in case you want to reconstruct it)
+   --------------------------------------------------------------------- */
+   _asm{
++    mov       [_save_ebx], ebx
+     mov       edi, [_residue]     /* load residue ptr     */
+     mov       eax, 0x00800080     /* generate constant    */
+     mov       ebx, [_dst_ystride] /* load dst-stride      */
+@@ -93,6 +95,7 @@
+     packuswb  mm3, mm4            /* #8 pack to byte      */
+     movq      [edx + ecx*2], mm1  /* #7 write row         */
+     movq      [edx + eax], mm3    /* #8 write row         */
++    mov       ebx, [_save_ebx]
+   }
+ }
+ 
+@@ -100,6 +103,7 @@
+ 
+ void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
+  const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
++  int _save_ebx;
+   /* ---------------------------------------------------------------------
+   This function does the inter reconstruction step with two iterations
+   running in parallel to hide some load-latencies and break the dependency
+@@ -107,6 +111,7 @@
+   comments (in case you want to reconstruct it)
+   --------------------------------------------------------------------- */
+   _asm{
++    mov       [_save_ebx], ebx
+     pxor      mm0, mm0          /* generate constant 0 */
+     mov       esi, [_src]
+     mov       edi, [_residue]
+@@ -143,6 +148,7 @@
+     movq      [edx + ebx], mm7  /* #2 write row          */
+     lea       edx, [edx+ebx*2]  /* dst += stride * 2     */
+     jne       nextchunk
++    mov       ebx, [_save_ebx]
+   }
+ }
+ 
+@@ -150,6 +156,7 @@
+ void oc_frag_recon_inter2_mmx(unsigned char *_dst,  int _dst_ystride,
+  const unsigned char *_src1,  int _src1_ystride, const unsigned char *_src2,
+  int _src2_ystride,const ogg_int16_t *_residue){
++  int _save_ebx;
+   /* ---------------------------------------------------------------------
+   This function does the inter2 reconstruction step.The building of the
+   average is done with a bit-twiddeling trick to avoid excessive register
+@@ -166,6 +173,7 @@
+   using the pavgb instruction let me know and I'll do the 3dnow codepath.
+   --------------------------------------------------------------------- */
+  _asm{
++   mov        [_save_ebx], ebx
+    mov        eax, 0xfefefefe
+    mov        esi, [_src1]
+    mov        edi, [_src2]
+@@ -204,6 +212,7 @@
+    packuswb   mm2,  mm3           /* pack and saturate   */
+    movq       [edx], mm2          /* write row           */
+    jne        nextrow
++   mov        ebx, [_save_ebx]
+  }
+ }
+ 
+Index: lib/dec/x86_vc/mmxloopfilter.c
+===================================================================
+--- lib/dec/x86_vc/mmxloopfilter.c	(revision 16142)
++++ lib/dec/x86_vc/mmxloopfilter.c	(working copy)
+@@ -38,7 +38,7 @@
+   _asm {
+     mov       eax,  [_pix]
+     mov       edx,  [_ystride]
+-    mov       ebx,  [_ll]
++    mov       ecx,  [_ll]
+ 
+     /* _pix -= ystride */
+     sub       eax,   edx
+@@ -104,7 +104,7 @@
+     /*Free up mm5.*/
+     packuswb  mm4, mm5
+     /*mm0=L L L L*/
+-    movq      mm0, [ebx]
++    movq      mm0, [ecx]
+     /*if(R_i<-2L||R_i>2L)R_i=0:*/
+     movq      mm5, mm2
+     pxor      mm6, mm6
--- a/media/libtheora/lib/Makefile.in
+++ b/media/libtheora/lib/Makefile.in
@@ -43,22 +43,20 @@ include $(DEPTH)/config/autoconf.mk
 MODULE		= theora
 LIBRARY_NAME	= theora
 FORCE_STATIC_LIB= 1
 
 # The encoder is currently not included.
 DEFINES += -DTHEORA_DISABLE_ENCODE
 
 ifeq ($(findstring 86,$(OS_TEST)), 86)
-ifneq ($(OS_ARCH),WINNT)
 ifneq ($(OS_ARCH),SunOS)
 DEFINES += -DOC_X86ASM -DUSE_ASM
 endif
 endif
-endif
 
 VPATH		:= $(srcdir) $(srcdir)/dec
 
 CSRCS		= \
 		cpu.c \
 		huffdec.c \
 		quant.c \
 		dequant.c \
@@ -70,21 +68,33 @@ CSRCS		= \
 		state.c \
 		info.c \
 		fragment.c \
 		apiwrapper.c \
 		decode.c \
 		$(NULL)
 
 ifeq ($(findstring 86,$(OS_TEST)), 86)
+ifeq ($(OS_ARCH),WINNT)
+VPATH		+= $(srcdir)/dec/x86_vc
+
+CSRCS		+= \
+		mmxfrag.c \
+		mmxloopfilter.c \
+		x86state.c \
+		mmxstate.c \
+		mmxidct.c \
+		$(NULL)
+else
 VPATH		+= $(srcdir)/dec/x86
 
 CSRCS		+= \
 		mmxfrag.c \
 		x86state.c \
 		mmxstate.c \
 		mmxidct.c \
 		$(NULL)
 endif
+endif
 
 include $(topsrcdir)/config/rules.mk
 
 LOCAL_INCLUDES = -I$(srcdir)
--- a/media/libtheora/lib/dec/x86_vc/mmxfrag.c
+++ b/media/libtheora/lib/dec/x86_vc/mmxfrag.c
@@ -22,22 +22,24 @@
 
   Initial implementation 2007 by Nils Pipenbrinck.
   ---------------------------------------------------------------------*/
 
 #if defined(USE_ASM)
 
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue){
+  int _save_ebx;
   /* ---------------------------------------------------------------------
   This function does the inter reconstruction step with 8 iterations
   unrolled. The iteration for each instruction is noted by the #id in the
   comments (in case you want to reconstruct it)
   --------------------------------------------------------------------- */
   _asm{
+    mov       [_save_ebx], ebx
     mov       edi, [_residue]     /* load residue ptr     */
     mov       eax, 0x00800080     /* generate constant    */
     mov       ebx, [_dst_ystride] /* load dst-stride      */
     mov       edx, [_dst]         /* load dest pointer    */
 
     /* unrolled loop begins here */
 
     movd      mm0, eax            /* load constant        */
@@ -88,30 +90,33 @@ void oc_frag_recon_intra_mmx(unsigned ch
     paddsw    mm1, mm0            /* #7 bias low  residue */
     paddsw    mm2, mm0            /* #7 bias high residue */
     packuswb  mm1, mm2            /* #7 pack to byte      */
     paddsw    mm3, mm0            /* #8 bias low  residue */
     paddsw    mm4, mm0            /* #8 bias high residue */
     packuswb  mm3, mm4            /* #8 pack to byte      */
     movq      [edx + ecx*2], mm1  /* #7 write row         */
     movq      [edx + eax], mm3    /* #8 write row         */
+    mov       ebx, [_save_ebx]
   }
 }
 
 
 
 void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
  const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
+  int _save_ebx;
   /* ---------------------------------------------------------------------
   This function does the inter reconstruction step with two iterations
   running in parallel to hide some load-latencies and break the dependency
   chains. The iteration for each instruction is noted by the #id in the
   comments (in case you want to reconstruct it)
   --------------------------------------------------------------------- */
   _asm{
+    mov       [_save_ebx], ebx
     pxor      mm0, mm0          /* generate constant 0 */
     mov       esi, [_src]
     mov       edi, [_residue]
     mov       eax, [_src_ystride]
     mov       edx, [_dst]
     mov       ebx, [_dst_ystride]
     mov       ecx, 4
 
@@ -138,39 +143,42 @@ nextchunk:
     add       edi, 32           /* residue += 4          */
     paddsw    mm7, mm5          /* #2 add residium low   */
     sub       ecx, 1            /* update loop counter   */
     packuswb  mm7, mm2          /* #2 final row          */
     lea       esi, [esi+eax*2]  /* src += stride * 2     */
     movq      [edx + ebx], mm7  /* #2 write row          */
     lea       edx, [edx+ebx*2]  /* dst += stride * 2     */
     jne       nextchunk
+    mov       ebx, [_save_ebx]
   }
 }
 
 
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,  int _dst_ystride,
  const unsigned char *_src1,  int _src1_ystride, const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
+  int _save_ebx;
   /* ---------------------------------------------------------------------
   This function does the inter2 reconstruction step.The building of the
   average is done with a bit-twiddeling trick to avoid excessive register
   copy work during byte to word conversion.
 
               average = (a & b) + (((a ^ b) & 0xfe) >> 1);
 
   (shown for a single byte; it's done with 8 of them at a time)
 
   Slightly faster than the obvious method using add and shift, but not
   earthshaking improvement either.
 
   If anyone comes up with a way that produces bit-identical outputs
   using the pavgb instruction let me know and I'll do the 3dnow codepath.
   --------------------------------------------------------------------- */
  _asm{
+   mov        [_save_ebx], ebx
    mov        eax, 0xfefefefe
    mov        esi, [_src1]
    mov        edi, [_src2]
    movd       mm1, eax
    mov        ebx, [_residue]
    mov        edx, [_dst]
    mov        eax, [_dst_ystride]
    punpckldq  mm1, mm1            /* replicate lsb32     */
@@ -199,16 +207,17 @@ nextrow:
    punpckhbw  mm3,  mm0           /* average high        */
    punpcklbw  mm2,  mm0           /* average low         */
    paddsw     mm3,  mm6           /* high + residue      */
    paddsw     mm2,  mm5           /* low  + residue      */
    sub        ecx,  1             /* update loop counter */
    packuswb   mm2,  mm3           /* pack and saturate   */
    movq       [edx], mm2          /* write row           */
    jne        nextrow
+   mov        ebx, [_save_ebx]
  }
 }
 
 void oc_restore_fpu_mmx(void){
   _asm { emms }
 }
 
 #endif
--- a/media/libtheora/lib/dec/x86_vc/mmxloopfilter.c
+++ b/media/libtheora/lib/dec/x86_vc/mmxloopfilter.c
@@ -33,17 +33,17 @@
 
 
 
 static void loop_filter_v(unsigned char *_pix,int _ystride,
                           const ogg_int16_t *_ll){
   _asm {
     mov       eax,  [_pix]
     mov       edx,  [_ystride]
-    mov       ebx,  [_ll]
+    mov       ecx,  [_ll]
 
     /* _pix -= ystride */
     sub       eax,   edx
     /*  mm0=0          */
     pxor      mm0,   mm0
     /* _pix -= ystride */
     sub       eax,   edx
     /*  esi=_ystride*3 */
@@ -99,17 +99,17 @@ static void loop_filter_v(unsigned char 
     paddw     mm2, mm0
     /*"Divide" by 8.*/
     psraw     mm3, 3
     psraw     mm2, 3
     /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the sepc.*/
     /*Free up mm5.*/
     packuswb  mm4, mm5
     /*mm0=L L L L*/
-    movq      mm0, [ebx]
+    movq      mm0, [ecx]
     /*if(R_i<-2L||R_i>2L)R_i=0:*/
     movq      mm5, mm2
     pxor      mm6, mm6
     movq      mm7, mm0
     psubw     mm6, mm0
     psllw     mm7, 1
     psllw     mm6, 1
     /*mm2==R_3 R_2 R_1 R_0*/
--- a/media/libtheora/update.sh
+++ b/media/libtheora/update.sh
@@ -48,8 +48,9 @@ cp $1/lib/dec/decode.c ./lib/dec/decode.
 cp $1/lib/dec/dequant.c ./lib/dec/dequant.c
 cp $1/lib/dec/quant.h ./lib/dec/quant.h
 cp $1/lib/dec/dequant.h ./lib/dec/dequant.h
 cp $1/lib/internal.h ./lib/internal.h
 cp $1/include/theora/theora.h ./include/theora/theora.h
 cp $1/include/theora/theoradec.h ./include/theora/theoradec.h
 cp $1/include/theora/codec.h ./include/theora/codec.h
 patch -p3 <455357_wince_local_variable_macro_clash_patch
+patch -p0 <bug498770.patch