Bug 1122900: Make libyuv compile with MSVC 2015, r=rjesup.
author: Brian Smith <brian@briansmith.org>
date: Tue, 31 Mar 2015 08:34:30 -1000
changeset: 257870:8190c13db4204d475bc578102481cf7dafd1f03d
parent: 257869:2622475e765b2ea1a937fba2107ebc4a328882ba
child: 257871:aa01c3fd458c688e505b6f5c106ae4ff3bebe040
push id: 8007
push user: raliiev@mozilla.com
push date: Mon, 11 May 2015 19:23:16 +0000
treeherder: mozilla-aurora@e2ce1aac996e
reviewers: rjesup
bugs: 1122900
milestone: 40.0a1
Bug 1122900: Make libyuv compile with MSVC 2015, r=rjesup. MSVC 2015 CTP 6 refuses to compile __declspec(naked) functions that also carry a __declspec(align(16)) (or align(32)) attribute, and MSVC already aligns functions at 16-byte boundaries in its normal configuration, so the align attributes can simply be removed.
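
For reference, the change is purely mechanical: every affected declaration drops its align attribute, as in this minimal before/after sketch (SumSquareError_SSE2 is taken from the first hunk below):

    // Before: MSVC 2015 CTP 6 rejects an align attribute on a naked function.
    __declspec(naked) __declspec(align(16))
    uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);

    // After: MSVC already places functions on 16-byte boundaries by default,
    // so dropping the attribute does not change the generated alignment.
    __declspec(naked)
    uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);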
media/libyuv/source/compare_win.cc
media/libyuv/source/rotate.cc
media/libyuv/source/row_win.cc
media/libyuv/source/scale_win.cc
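
Had the explicit alignment actually been load-bearing, guarding the attribute by compiler version would have been an alternative; the following is a hypothetical sketch for contrast only (FUNC_ALIGN is an invented name, and this is not what the patch does):

    // Hypothetical: keep align(n) on older compilers, drop it where MSVC 2015
    // rejects it on naked functions. _MSC_VER 1900 is Visual Studio 2015.
    #if defined(_MSC_VER) && _MSC_VER >= 1900
    #define FUNC_ALIGN(n)  /* rely on MSVC's default 16-byte function alignment */
    #else
    #define FUNC_ALIGN(n) __declspec(align(n))
    #endif

    __declspec(naked) FUNC_ALIGN(16)
    uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);

Since the default alignment already matches, the patch instead removes the attribute outright.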
--- a/media/libyuv/source/compare_win.cc
+++ b/media/libyuv/source/compare_win.cc
@@ -13,17 +13,17 @@
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   __asm {
     mov        eax, [esp + 4]    // src_a
     mov        edx, [esp + 8]    // src_b
     mov        ecx, [esp + 12]   // count
     pxor       xmm0, xmm0
     pxor       xmm5, xmm5
 
@@ -55,17 +55,17 @@ uint32 SumSquareError_SSE2(const uint8* 
     ret
   }
 }
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
 #pragma warning(disable: 4752)
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
   __asm {
     mov        eax, [esp + 4]    // src_a
     mov        edx, [esp + 8]    // src_b
     mov        ecx, [esp + 12]   // count
     vpxor      ymm0, ymm0, ymm0  // sum
     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
     sub        edx, eax
@@ -130,17 +130,17 @@ static uvec32 kHashMul3 = {
 // 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
 // 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
 // 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
 // 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
 // 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
     _asm _emit 0x40 _asm _emit reg
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
   __asm {
     mov        eax, [esp + 4]    // src
     mov        ecx, [esp + 8]    // count
     movd       xmm0, [esp + 12]  // seed
 
     pxor       xmm7, xmm7        // constant 0 for unpck
     movdqa     xmm6, kHash16x33
@@ -182,17 +182,17 @@ uint32 HashDjb2_SSE41(const uint8* src, 
 
     movd       eax, xmm0         // return hash
     ret
   }
 }
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
   __asm {
     mov        eax, [esp + 4]    // src
     mov        ecx, [esp + 8]    // count
     movd       xmm0, [esp + 12]  // seed
     movdqa     xmm6, kHash16x33
 
     align      4
--- a/media/libyuv/source/rotate.cc
+++ b/media/libyuv/source/rotate.cc
@@ -71,17 +71,17 @@ void TransposeUVWx8_MIPS_DSPR2(const uin
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int width);
 #endif  // defined(__mips__)
 
 #if !defined(LIBYUV_DISABLE_X86) && \
     defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
   __asm {
     push      edi
     push      esi
     push      ebp
     mov       eax, [esp + 12 + 4]   // src
     mov       edi, [esp + 12 + 8]   // src_stride
@@ -163,17 +163,17 @@ static void TransposeWx8_SSSE3(const uin
     pop       ebp
     pop       esi
     pop       edi
     ret
   }
 }
 
 #define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                 uint8* dst_a, int dst_stride_a,
                                 uint8* dst_b, int dst_stride_b,
                                 int w) {
   __asm {
     push      ebx
     push      esi
     push      edi
--- a/media/libyuv/source/row_win.cc
+++ b/media/libyuv/source/row_win.cc
@@ -139,17 +139,17 @@ static const uvec8 kShuffleMaskARGBToRGB
 };
 
 // Shuffle table for converting ARGB to RAW.
 static const uvec8 kShuffleMaskARGBToRAW_0 = {
   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
 };
 
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   __asm {
     mov        eax, [esp + 4]        // src_y
     mov        edx, [esp + 8]        // dst_argb
     mov        ecx, [esp + 12]       // pix
     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
     pslld      xmm5, 24
 
@@ -167,17 +167,17 @@ void I400ToARGBRow_SSE2(const uint8* src
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                   int pix) {
   __asm {
     mov        eax, [esp + 4]        // src_y
     mov        edx, [esp + 8]        // dst_argb
     mov        ecx, [esp + 12]       // pix
     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
     pslld      xmm5, 24
@@ -196,17 +196,17 @@ void I400ToARGBRow_Unaligned_SSE2(const 
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_rgb24
     mov       edx, [esp + 8]   // dst_argb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
     pslld     xmm5, 24
     movdqa    xmm4, kShuffleMaskRGB24ToARGB
@@ -235,17 +235,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* s
     sub       ecx, 16
     movdqa    [edx + 48], xmm3
     lea       edx, [edx + 64]
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                         int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_raw
     mov       edx, [esp + 8]   // dst_argb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
     pslld     xmm5, 24
@@ -282,17 +282,17 @@ void RAWToARGBRow_SSSE3(const uint8* src
 
 // pmul method to replicate bits.
 // Math to replicate bits:
 // (v << 8) | (v << 3)
 // v * 256 + v * 8
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                           int pix) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
     pshufd    xmm5, xmm5, 0
     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     movd      xmm6, eax
@@ -333,17 +333,17 @@ void RGB565ToARGBRow_SSE2(const uint8* s
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
     ret
   }
 }
 
 // 24 instructions
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                             int pix) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
     pshufd    xmm5, xmm5, 0
     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     movd      xmm6, eax
@@ -387,17 +387,17 @@ void ARGB1555ToARGBRow_SSE2(const uint8*
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
     ret
   }
 }
 
 // 18 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix) {
   __asm {
     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
     movd      xmm4, eax
     pshufd    xmm4, xmm4, 0
     movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
     pslld     xmm5, 4
@@ -426,17 +426,17 @@ void ARGB4444ToARGBRow_SSE2(const uint8*
     movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm6, kShuffleMaskARGBToRGB24
 
     align      4
@@ -465,17 +465,17 @@ void ARGBToRGB24Row_SSSE3(const uint8* s
     movdqu    [edx + 32], xmm2   // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm6, kShuffleMaskARGBToRAW
 
     align      4
@@ -504,17 +504,17 @@ void ARGBToRAWRow_SSSE3(const uint8* src
     movdqu    [edx + 32], xmm2   // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
     psrld     xmm3, 27
     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
@@ -543,17 +543,17 @@ void ARGBToRGB565Row_SSE2(const uint8* s
     lea       edx, [edx + 8]
     sub       ecx, 4
     jg        convertloop
     ret
   }
 }
 
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
     psrld     xmm4, 27
     movdqa    xmm5, xmm4       // generate mask 0x000003e0
@@ -585,17 +585,17 @@ void ARGBToARGB1555Row_SSE2(const uint8*
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
     lea       edx, [edx + 8]
     sub       ecx, 4
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
     psllw     xmm4, 12
     movdqa    xmm3, xmm4       // generate mask 0x00f000f0
@@ -616,17 +616,17 @@ void ARGBToARGB4444Row_SSE2(const uint8*
     lea       edx, [edx + 8]
     sub       ecx, 4
     jg        convertloop
     ret
   }
 }
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kARGBToY
 
@@ -651,17 +651,17 @@ void ARGBToYRow_SSSE3(const uint8* src_a
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm4, kARGBToYJ
     movdqa     xmm5, kAddYJ64
 
@@ -688,17 +688,17 @@ void ARGBToYJRow_SSSE3(const uint8* src_
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     vbroadcastf128 ymm4, kARGBToY
     vbroadcastf128 ymm5, kAddY16
     vmovdqa    ymm6, kPermdARGBToY_AVX
@@ -728,17 +728,17 @@ void ARGBToYRow_AVX2(const uint8* src_ar
     vzeroupper
     ret
   }
 }
 #endif  //  HAS_ARGBTOYROW_AVX2
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     vbroadcastf128 ymm4, kARGBToYJ
     vbroadcastf128 ymm5, kAddYJ64
     vmovdqa    ymm6, kPermdARGBToY_AVX
@@ -768,17 +768,17 @@ void ARGBToYJRow_AVX2(const uint8* src_a
     jg         convertloop
 
     vzeroupper
     ret
   }
 }
 #endif  //  HAS_ARGBTOYJROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kARGBToY
 
@@ -802,17 +802,17 @@ void ARGBToYRow_Unaligned_SSSE3(const ui
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm4, kARGBToYJ
     movdqa     xmm5, kAddYJ64
 
@@ -837,17 +837,17 @@ void ARGBToYJRow_Unaligned_SSSE3(const u
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kBGRAToY
 
@@ -871,17 +871,17 @@ void BGRAToYRow_SSSE3(const uint8* src_a
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kBGRAToY
 
@@ -905,17 +905,17 @@ void BGRAToYRow_Unaligned_SSSE3(const ui
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kABGRToY
 
@@ -939,17 +939,17 @@ void ABGRToYRow_SSSE3(const uint8* src_a
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kABGRToY
 
@@ -973,17 +973,17 @@ void ABGRToYRow_Unaligned_SSSE3(const ui
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kRGBAToY
 
@@ -1007,17 +1007,17 @@ void RGBAToYRow_SSSE3(const uint8* src_a
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kRGBAToY
 
@@ -1041,17 +1041,17 @@ void RGBAToYRow_Unaligned_SSSE3(const ui
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1107,17 +1107,17 @@ void ARGBToUVRow_SSSE3(const uint8* src_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                         uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1175,17 +1175,17 @@ void ARGBToUVJRow_SSSE3(const uint8* src
 
     pop        edi
     pop        esi
     ret
   }
 }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1241,17 +1241,17 @@ void ARGBToUVRow_AVX2(const uint8* src_a
     pop        edi
     pop        esi
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBTOUVROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1311,17 +1311,17 @@ void ARGBToUVRow_Unaligned_SSSE3(const u
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1382,17 +1382,17 @@ void ARGBToUVJRow_Unaligned_SSSE3(const 
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]   // src_argb
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
@@ -1440,17 +1440,17 @@ void ARGBToUV444Row_SSSE3(const uint8* s
     lea        edx,  [edx + 16]
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                     uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]   // src_argb
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
@@ -1498,17 +1498,17 @@ void ARGBToUV444Row_Unaligned_SSSE3(cons
     lea        edx,  [edx + 16]
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]   // src_argb
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
@@ -1557,17 +1557,17 @@ void ARGBToUV422Row_SSSE3(const uint8* s
     lea        edx, [edx + 8]
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                     uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]   // src_argb
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
@@ -1616,17 +1616,17 @@ void ARGBToUV422Row_Unaligned_SSSE3(cons
     lea        edx, [edx + 8]
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1682,17 +1682,17 @@ void BGRAToUVRow_SSSE3(const uint8* src_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1752,17 +1752,17 @@ void BGRAToUVRow_Unaligned_SSSE3(const u
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1818,17 +1818,17 @@ void ABGRToUVRow_SSSE3(const uint8* src_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1888,17 +1888,17 @@ void ABGRToUVRow_Unaligned_SSSE3(const u
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1954,17 +1954,17 @@ void RGBAToUVRow_SSSE3(const uint8* src_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -2072,17 +2072,17 @@ static const lvec16 kUVBiasG_AVX = {
   BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
 };
 static const lvec16 kUVBiasR_AVX = {
   BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
 };
 
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_AVX2(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2271,17 +2271,17 @@ static const vec16 kUVBiasR = { BR, BR, 
     __asm psraw      xmm2, 6                                                   \
     __asm packuswb   xmm0, xmm0           /* B */                              \
     __asm packuswb   xmm1, xmm1           /* G */                              \
     __asm packuswb   xmm2, xmm2           /* R */                              \
   }
 
 // 8 pixels, dest aligned 16.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I444ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2314,17 +2314,17 @@ void I444ToARGBRow_SSSE3(const uint8* y_
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* dst_rgb24,
                           int width) {
   __asm {
     push       esi
     push       edi
@@ -2361,17 +2361,17 @@ void I422ToRGB24Row_SSSE3(const uint8* y
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRAWRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_raw,
                         int width) {
   __asm {
     push       esi
     push       edi
@@ -2408,17 +2408,17 @@ void I422ToRAWRow_SSSE3(const uint8* y_b
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest unaligned.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                            const uint8* u_buf,
                            const uint8* v_buf,
                            uint8* rgb565_buf,
                            int width) {
   __asm {
     push       esi
     push       edi
@@ -2481,17 +2481,17 @@ void I422ToRGB565Row_SSSE3(const uint8* 
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2525,17 +2525,17 @@ void I422ToARGBRow_SSSE3(const uint8* y_
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I411ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       ebx
     push       esi
@@ -2570,17 +2570,17 @@ void I411ToARGBRow_SSSE3(const uint8* y_
     pop        esi
     pop        ebx
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* uv_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // UV
@@ -2608,17 +2608,17 @@ void NV12ToARGBRow_SSSE3(const uint8* y_
 
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* uv_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // VU
@@ -2646,17 +2646,17 @@ void NV21ToARGBRow_SSSE3(const uint8* y_
 
     pop        esi
     ret
   }
 }
 
 // 8 pixels, unaligned.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -2689,17 +2689,17 @@ void I444ToARGBRow_Unaligned_SSSE3(const
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, unaligned.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -2733,17 +2733,17 @@ void I422ToARGBRow_Unaligned_SSSE3(const
     pop        esi
     ret
   }
 }
 
 // 8 pixels, unaligned.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       ebx
     push       esi
@@ -2778,17 +2778,17 @@ void I411ToARGBRow_Unaligned_SSSE3(const
     pop        esi
     pop        ebx
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* uv_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // UV
@@ -2816,17 +2816,17 @@ void NV12ToARGBRow_Unaligned_SSSE3(const
 
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* uv_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // VU
@@ -2852,17 +2852,17 @@ void NV21ToARGBRow_Unaligned_SSSE3(const
     sub        ecx, 8
     jg         convertloop
 
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToBGRARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_bgra,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2893,17 +2893,17 @@ void I422ToBGRARow_SSSE3(const uint8* y_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_bgra,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -2934,17 +2934,17 @@ void I422ToBGRARow_Unaligned_SSSE3(const
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToABGRRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_abgr,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2975,17 +2975,17 @@ void I422ToABGRRow_SSSE3(const uint8* y_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_abgr,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -3016,17 +3016,17 @@ void I422ToABGRRow_Unaligned_SSSE3(const
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGBARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_rgba,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -3057,17 +3057,17 @@ void I422ToRGBARow_SSSE3(const uint8* y_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_rgba,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -3101,17 +3101,17 @@ void I422ToRGBARow_Unaligned_SSSE3(const
     pop        esi
     ret
   }
 }
 
 #endif  // HAS_I422TOARGBROW_SSSE3
 
 #ifdef HAS_YTOARGBROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YToARGBRow_SSE2(const uint8* y_buf,
                      uint8* rgb_buf,
                      int width) {
   __asm {
     pxor       xmm5, xmm5
     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
     pslld      xmm4, 24
     mov        eax, 0x00100010
@@ -3154,17 +3154,17 @@ void YToARGBRow_SSE2(const uint8* y_buf,
 #endif  // HAS_YTOARGBROW_SSE2
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
 static const uvec8 kShuffleMirror = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     movdqa    xmm5, kShuffleMirror
     lea       eax, [eax - 16]
 
@@ -3183,17 +3183,17 @@ void MirrorRow_SSSE3(const uint8* src, u
 
 #ifdef HAS_MIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
 static const ulvec8 kShuffleMirror_AVX2 = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     vmovdqa   ymm5, kShuffleMirror_AVX2
     lea       eax, [eax - 32]
 
@@ -3210,17 +3210,17 @@ void MirrorRow_AVX2(const uint8* src, ui
     ret
   }
 }
 #endif  // HAS_MIRRORROW_AVX2
 
 #ifdef HAS_MIRRORROW_SSE2
 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
 // version can not.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16]
 
     align      4
@@ -3243,17 +3243,17 @@ void MirrorRow_SSE2(const uint8* src, ui
 #endif  // HAS_MIRRORROW_SSE2
 
 #ifdef HAS_MIRRORROW_UV_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
 static const uvec8 kShuffleMirrorUV = {
   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                        int width) {
   __asm {
     push      edi
     mov       eax, [esp + 4 + 4]   // src
     mov       edx, [esp + 4 + 8]   // dst_u
     mov       edi, [esp + 4 + 12]  // dst_v
     mov       ecx, [esp + 4 + 16]  // width
@@ -3279,17 +3279,17 @@ void MirrorUVRow_SSSE3(const uint8* src,
 #endif  // HAS_MIRRORROW_UV_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
 static const uvec8 kARGBShuffleMirror = {
   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
     movdqa    xmm5, kARGBShuffleMirror
 
@@ -3308,17 +3308,17 @@ void ARGBMirrorRow_SSSE3(const uint8* sr
 #endif  // HAS_ARGBMIRRORROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
 static const ulvec32 kARGBShuffleMirror_AVX2 = {
   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 32]
     vmovdqa   ymm5, kARGBShuffleMirror_AVX2
 
@@ -3331,17 +3331,17 @@ void ARGBMirrorRow_AVX2(const uint8* src
     jg        convertloop
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
@@ -3367,17 +3367,17 @@ void SplitUVRow_SSE2(const uint8* src_uv
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                                int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -3406,17 +3406,17 @@ void SplitUVRow_Unaligned_SSE2(const uin
 
     pop        edi
     ret
   }
 }
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
@@ -3445,17 +3445,17 @@ void SplitUVRow_AVX2(const uint8* src_uv
     pop        edi
     vzeroupper
     ret
   }
 }
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_u
     mov        edx, [esp + 4 + 8]    // src_v
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // width
@@ -3475,17 +3475,17 @@ void MergeUVRow_SSE2(const uint8* src_u,
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                                uint8* dst_uv, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_u
     mov        edx, [esp + 4 + 8]    // src_v
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // width
@@ -3507,17 +3507,17 @@ void MergeUVRow_Unaligned_SSE2(const uin
 
     pop        edi
     ret
   }
 }
 #endif  //  HAS_MERGEUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_u
     mov        edx, [esp + 4 + 8]    // src_v
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // width
@@ -3542,17 +3542,17 @@ void MergeUVRow_AVX2(const uint8* src_u,
     vzeroupper
     ret
   }
 }
 #endif  //  HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
 
     align      4
   convertloop:
@@ -3565,33 +3565,33 @@ void CopyRow_SSE2(const uint8* src, uint
     sub        ecx, 32
     jg         convertloop
     ret
   }
 }
 #endif  // HAS_COPYROW_SSE2
 
 // Unaligned Multiple of 1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, esi
     mov        edx, edi
     mov        esi, [esp + 4]   // src
     mov        edi, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     rep movsb
     mov        edi, edx
     mov        esi, eax
     ret
   }
 }
 
 #ifdef HAS_COPYROW_X86
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, esi
     mov        edx, edi
     mov        esi, [esp + 4]   // src
     mov        edi, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     shr        ecx, 2
@@ -3600,17 +3600,17 @@ void CopyRow_X86(const uint8* src, uint8
     mov        esi, eax
     ret
   }
 }
 #endif  // HAS_COPYROW_X86
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
     pslld      xmm0, 24
     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
@@ -3637,17 +3637,17 @@ void ARGBCopyAlphaRow_SSE2(const uint8* 
 
     ret
   }
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     vpcmpeqb   ymm0, ymm0, ymm0
     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
 
@@ -3667,17 +3667,17 @@ void ARGBCopyAlphaRow_AVX2(const uint8* 
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
     pslld      xmm0, 24
     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
@@ -3706,17 +3706,17 @@ void ARGBCopyYToAlphaRow_SSE2(const uint
 
     ret
   }
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     vpcmpeqb   ymm0, ymm0, ymm0
     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
 
@@ -3738,32 +3738,32 @@ void ARGBCopyYToAlphaRow_AVX2(const uint
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SetRow_X86(uint8* dst, uint32 v32, int count) {
   __asm {
     mov        edx, edi
     mov        edi, [esp + 4]   // dst
     mov        eax, [esp + 8]   // v32
     mov        ecx, [esp + 12]  // count
     shr        ecx, 2
     rep stosd
     mov        edi, edx
     ret
   }
 }
 
 // SetRow32 writes 'count' words using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
                    int dst_stride, int height) {
   __asm {
     push       esi
     push       edi
     push       ebp
     mov        edi, [esp + 12 + 4]   // dst
     mov        eax, [esp + 12 + 8]   // v32
@@ -3785,17 +3785,17 @@ void ARGBSetRows_X86(uint8* dst, uint32 
     pop        edi
     pop        esi
     ret
   }
 }
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
                      uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_yuy2
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
@@ -3813,17 +3813,17 @@ void YUY2ToYRow_AVX2(const uint8* src_yu
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -3858,17 +3858,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_y
 
     pop        edi
     pop        esi
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -3898,17 +3898,17 @@ void YUY2ToUV422Row_AVX2(const uint8* sr
     jg         convertloop
 
     pop        edi
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_AVX2(const uint8* src_uyvy,
                      uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_uyvy
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
 
     align      4
@@ -3924,17 +3924,17 @@ void UYVYToYRow_AVX2(const uint8* src_uy
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
     ret
     vzeroupper
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -3969,17 +3969,17 @@ void UYVYToUVRow_AVX2(const uint8* src_u
 
     pop        edi
     pop        esi
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4011,17 +4011,17 @@ void UYVYToUV422Row_AVX2(const uint8* sr
     pop        edi
     vzeroupper
     ret
   }
 }
 #endif  // HAS_YUY2TOYROW_AVX2
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                      uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_yuy2
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
     psrlw      xmm5, 8
@@ -4037,17 +4037,17 @@ void YUY2ToYRow_SSE2(const uint8* src_yu
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -4081,17 +4081,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_y
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4118,17 +4118,17 @@ void YUY2ToUV422Row_SSE2(const uint8* sr
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                                uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_yuy2
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
     psrlw      xmm5, 8
@@ -4144,17 +4144,17 @@ void YUY2ToYRow_Unaligned_SSE2(const uin
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -4188,17 +4188,17 @@ void YUY2ToUVRow_Unaligned_SSE2(const ui
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                    uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4225,17 +4225,17 @@ void YUY2ToUV422Row_Unaligned_SSE2(const
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_SSE2(const uint8* src_uyvy,
                      uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_uyvy
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
 
     align      4
@@ -4249,17 +4249,17 @@ void UYVYToYRow_SSE2(const uint8* src_uy
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_uyvy
     mov        esi, [esp + 8 + 8]    // stride_uyvy
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -4293,17 +4293,17 @@ void UYVYToUVRow_SSE2(const uint8* src_u
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uyvy
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4330,17 +4330,17 @@ void UYVYToUV422Row_SSE2(const uint8* sr
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_uyvy
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
 
     align      4
@@ -4354,17 +4354,17 @@ void UYVYToYRow_Unaligned_SSE2(const uin
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_uyvy
     mov        esi, [esp + 8 + 8]    // stride_uyvy
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -4398,17 +4398,17 @@ void UYVYToUVRow_Unaligned_SSE2(const ui
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                    uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uyvy
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4438,17 +4438,17 @@ void UYVYToUV422Row_Unaligned_SSE2(const
     pop        edi
     ret
   }
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
 #ifdef HAS_ARGBBLENDROW_SSE2
 // Blend 8 pixels at a time.
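// A scalar sketch of the blend, assuming an attenuated (premultiplied)
// foreground in src_argb0 as libyuv's blend expects:
//   dst = fg + ((256 - fg_alpha) * bg >> 8), with dst alpha forced to 255.
#include <stdint.h>
static void ARGBBlendRow_Sketch(const uint8_t* fg, const uint8_t* bg,
                                uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = fg[4 * i + 3];
    for (int c = 0; c < 3; ++c) {
      dst[4 * i + c] =
          (uint8_t)(fg[4 * i + c] + (((256 - a) * bg[4 * i + c]) >> 8));
    }
    dst[4 * i + 3] = 255;
  }
}
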
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -4572,17 +4572,17 @@ static const uvec8 kShuffleAlpha = {
 // Same as SSE2, but replaces:
 //    psrlw      xmm3, 8          // alpha
 //    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
 //    pshuflw    xmm3, xmm3, 0F5h
 // with:
 //    pshufb     xmm3, kShuffleAlpha // alpha
 // Blend 8 pixels at a time.
 
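// An intrinsics sketch of the two alpha-broadcast sequences being
// contrasted above, on two unpacked ARGB pixels held as 8 16-bit lanes
// (function names are illustrative, not libyuv APIs):
#include <tmmintrin.h>  // SSSE3
static __m128i AlphaWords_SSE2_Sketch(__m128i px) {
  __m128i a = _mm_srli_epi16(px, 8);    // psrlw   xmm3, 8
  a = _mm_shufflehi_epi16(a, 0xF5);     // pshufhw xmm3, xmm3, 0F5h
  return _mm_shufflelo_epi16(a, 0xF5);  // pshuflw xmm3, xmm3, 0F5h
}
static __m128i AlphaWords_SSSE3_Sketch(__m128i px, __m128i shuffle_alpha) {
  return _mm_shuffle_epi8(px, shuffle_alpha);  // single pshufb
}
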
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -4720,17 +4720,17 @@ void ARGBBlendRow_SSSE3(const uint8* src
     ret
   }
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
 #ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
 // Aligned to 16 bytes.
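// Scalar sketch of the attenuate rows: each color channel is scaled by
// its pixel's alpha, approximately f * a / 255 (the SIMD paths get there
// via byte-duplicated 16-bit multiplies, mirrored here by the 0x101
// factors):
#include <stdint.h>
static void ARGBAttenuateRow_Sketch(const uint8_t* src, uint8_t* dst,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src[4 * i + 3];
    for (int c = 0; c < 3; ++c) {
      dst[4 * i + c] =
          (uint8_t)(((src[4 * i + c] * 0x101u) * (a * 0x101u)) >> 24);
    }
    dst[4 * i + 3] = (uint8_t)a;  // alpha is carried through
  }
}
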
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
     pslld      xmm4, 24
     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
@@ -4770,17 +4770,17 @@ void ARGBAttenuateRow_SSE2(const uint8* 
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha0 = {
   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
 };
 static const uvec8 kShuffleAlpha1 = {
   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
     pslld      xmm3, 24
     movdqa     xmm4, kShuffleAlpha0
@@ -4818,17 +4818,17 @@ void ARGBAttenuateRow_SSSE3(const uint8*
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const ulvec8 kShuffleAlpha_AVX2 = {
   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
   14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
   14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vmovdqa    ymm4, kShuffleAlpha_AVX2
     vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
@@ -4857,17 +4857,17 @@ void ARGBAttenuateRow_AVX2(const uint8* 
     ret
   }
 }
 #endif  // HAS_ARGBATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
 // Aligned to 16 bytes.
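// Scalar sketch of the unattenuate rows: multiply each channel by a
// fixed-point reciprocal of alpha and saturate. The SIMD paths read the
// reciprocal from a per-alpha table; here it is computed inline:
#include <stdint.h>
static void ARGBUnattenuateRow_Sketch(const uint8_t* src, uint8_t* dst,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src[4 * i + 3];
    uint32_t ia = a ? 65536u / a : 0u;  // 256/a in 8.8 fixed point
    for (int c = 0; c < 3; ++c) {
      uint32_t v = (src[4 * i + c] * ia) >> 8;
      dst[4 * i + c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst[4 * i + 3] = (uint8_t)a;
  }
}
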
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb0
     mov        edx, [esp + 8 + 8]   // dst_argb
     mov        ecx, [esp + 8 + 12]  // width
@@ -4913,17 +4913,17 @@ void ARGBUnattenuateRow_SSE2(const uint8
 // Shuffle table duplicating alpha.
 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
 };
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 // USE_GATHER is not enabled by default because gather instructions are
 // slow on current hardware.
 #ifdef USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
@@ -4948,17 +4948,17 @@ void ARGBUnattenuateRow_AVX2(const uint8
     lea        eax, [eax + 32]
     jg         convertloop
 
     vzeroupper
     ret
   }
 }
 #else  // USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
 
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
@@ -5016,17 +5016,17 @@ void ARGBUnattenuateRow_AVX2(const uint8
     ret
   }
 }
 #endif  // USE_GATHER
 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
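// Scalar sketch of the conversion, assuming libyuv's kARGBToYJ weights
// (B, G, R = 15, 75, 38; they sum to 128, hence the >>7, with kAddYJ64
// supplying the rounding term):
#include <stdint.h>
static void ARGBGrayRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int b = src[4 * i + 0], g = src[4 * i + 1], r = src[4 * i + 2];
    uint8_t y = (uint8_t)((b * 15 + g * 75 + r * 38 + 64) >> 7);
    dst[4 * i + 0] = dst[4 * i + 1] = dst[4 * i + 2] = y;
    dst[4 * i + 3] = src[4 * i + 3];  // alpha unchanged
  }
}
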
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 12]  /* width */
     movdqa     xmm4, kARGBToYJ
     movdqa     xmm5, kAddYJ64
 
@@ -5076,17 +5076,17 @@ static const vec8 kARGBToSepiaG = {
   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
 };
 
 static const vec8 kARGBToSepiaR = {
   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
 };
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
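// Scalar sketch using the weights above (kARGBToSepiaB assumed to be
// {17, 68, 35}); the SSSE3 path saturates when packing, so the sketch
// clamps explicitly:
#include <stdint.h>
static void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[4 * i + 0];
    int g = dst_argb[4 * i + 1];
    int r = dst_argb[4 * i + 2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[4 * i + 0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[4 * i + 1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[4 * i + 2] = (uint8_t)(sr > 255 ? 255 : sr);
  }
}
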
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* dst_argb */
     mov        ecx, [esp + 8]   /* width */
     movdqa     xmm2, kARGBToSepiaB
     movdqa     xmm3, kARGBToSepiaG
     movdqa     xmm4, kARGBToSepiaR
 
@@ -5134,17 +5134,17 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb,
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
 
 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                               const int8* matrix_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 12]  /* matrix_argb */
     movdqu     xmm5, [ecx]
     pshufd     xmm2, xmm5, 0x00
@@ -5197,17 +5197,17 @@ void ARGBColorMatrixRow_SSSE3(const uint
     ret
   }
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
 // Aligned to 16 bytes.
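// Scalar sketch; scale is a caller-supplied 16.16 fixed-point reciprocal
// of interval_size, so each color channel snaps to its interval (alpha
// is untouched):
#include <stdint.h>
static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = dst_argb[4 * i + c];
      dst_argb[4 * i + c] =
          (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
    }
  }
}
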
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width) {
   __asm {
     mov        eax, [esp + 4]    /* dst_argb */
     movd       xmm2, [esp + 8]   /* scale */
     movd       xmm3, [esp + 12]  /* interval_size */
     movd       xmm4, [esp + 16]  /* interval_offset */
     mov        ecx, [esp + 20]   /* width */
@@ -5244,17 +5244,17 @@ void ARGBQuantizeRow_SSE2(uint8* dst_arg
     ret
   }
 }
 #endif  // HAS_ARGBQUANTIZEROW_SSE2
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
 // Aligned to 16 bytes.
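// Scalar sketch: the asm byte-duplicates both the pixel and the packed
// ARGB `value` (punpcklbw), so each channel ends up approximately
// src * value_channel / 255:
#include <stdint.h>
static void ARGBShadeRow_Sketch(const uint8_t* src, uint8_t* dst, int width,
                                uint32_t value) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      uint32_t v = (value >> (8 * c)) & 0xff;
      uint32_t f = src[4 * i + c];
      dst[4 * i + c] = (uint8_t)(((f * 0x101u) * (v * 0x101u)) >> 24);
    }
  }
}
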
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   __asm {
     mov        eax, [esp + 4]   // src_argb
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
     punpcklbw  xmm2, xmm2
@@ -5279,17 +5279,17 @@ void ARGBShadeRow_SSE2(const uint8* src_
 
     ret
   }
 }
 #endif  // HAS_ARGBSHADEROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
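// Scalar sketch: one operand is byte-duplicated to 16 bits (x * 0x101),
// so the pmulhuw >>16 yields approximately src0 * src1 / 255 per channel:
#include <stdint.h>
static void ARGBMultiplyRow_Sketch(const uint8_t* src0, const uint8_t* src1,
                                   uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst[i] = (uint8_t)((src0[i] * 0x101 * src1[i]) >> 16);
  }
}
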
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5319,17 +5319,17 @@ void ARGBMultiplyRow_SSE2(const uint8* s
     ret
   }
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
 
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // TODO(fbarchard): Port this to posix, neon and other math functions.
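// Scalar sketch: the add rows are a saturating byte add (paddusb); the
// subtract rows below are the saturating byte subtract (psubusb):
#include <stdint.h>
static void ARGBAddRow_Sketch(const uint8_t* src0, const uint8_t* src1,
                              uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int s = src0[i] + src1[i];
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
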
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                      uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5368,17 +5368,17 @@ void ARGBAddRow_SSE2(const uint8* src_ar
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBADDROW_SSE2
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract one row of ARGB pixels from another, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5398,17 +5398,17 @@ void ARGBSubtractRow_SSE2(const uint8* s
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5436,17 +5436,17 @@ void ARGBMultiplyRow_AVX2(const uint8* s
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBMULTIPLYROW_AVX2
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                      uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5466,17 +5466,17 @@ void ARGBAddRow_AVX2(const uint8* src_ar
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBADDROW_AVX2
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract one row of ARGB pixels from another, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5499,17 +5499,17 @@ void ARGBSubtractRow_AVX2(const uint8* s
 }
 #endif  // HAS_ARGBSUBTRACTROW_AVX2
 
 #ifdef HAS_SOBELXROW_SSE2
 // SobelX as a matrix is
 // -1  0  1
 // -2  0  2
 // -1  0  1
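// Scalar sketch of SobelXRow_*: the kernel above applied at column i,
// |(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|, clamped:
#include <stdint.h>
static void SobelXRow_Sketch(const uint8_t* y0, const uint8_t* y1,
                             const uint8_t* y2, uint8_t* dst_sobelx,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
            (y2[i] - y2[i + 2]);
    if (s < 0) s = -s;
    dst_sobelx[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
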
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobelx, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_y0
     mov        esi, [esp + 8 + 8]   // src_y1
     mov        edi, [esp + 8 + 12]  // src_y2
@@ -5556,17 +5556,17 @@ void SobelXRow_SSE2(const uint8* src_y0,
 }
 #endif  // HAS_SOBELXROW_SSE2
 
 #ifdef HAS_SOBELYROW_SSE2
 // SobelY as a matrix is
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
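// Scalar sketch of SobelYRow_*: differences between two rows, weighted
// 1-2-1 across three columns, absolute value clamped:
#include <stdint.h>
static void SobelYRow_Sketch(const uint8_t* y0, const uint8_t* y1,
                             uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
            (y0[i + 2] - y1[i + 2]);
    if (s < 0) s = -s;
    dst_sobely[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
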
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                     uint8* dst_sobely, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_y0
     mov        esi, [esp + 4 + 8]   // src_y1
     mov        edx, [esp + 4 + 12]  // dst_sobely
     mov        ecx, [esp + 4 + 16]  // width
@@ -5610,17 +5610,17 @@ void SobelYRow_SSE2(const uint8* src_y0,
 #endif  // HAS_SOBELYROW_SSE2
 
 #ifdef HAS_SOBELROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
 // A = 255
 // R = Sobel
 // G = Sobel
 // B = Sobel
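// Scalar sketch: the saturated sum of the two gradients is replicated
// into B, G and R (SobelXYRow_SSE2 further below instead stores sobelx
// in R, the sum in G and sobely in B):
#include <stdint.h>
static void SobelRow_Sketch(const uint8_t* sx, const uint8_t* sy,
                            uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = sx[i] + sy[i];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);
    dst_argb[4 * i + 0] = v;
    dst_argb[4 * i + 1] = v;
    dst_argb[4 * i + 2] = v;
    dst_argb[4 * i + 3] = 255;
  }
}
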
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                    uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_sobelx
     mov        esi, [esp + 4 + 8]   // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5658,17 +5658,17 @@ void SobelRow_SSE2(const uint8* src_sobe
     pop        esi
     ret
   }
 }
 #endif  // HAS_SOBELROW_SSE2
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                           uint8* dst_y, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_sobelx
     mov        esi, [esp + 4 + 8]   // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_y
     mov        ecx, [esp + 4 + 16]  // width
@@ -5692,17 +5692,17 @@ void SobelToPlaneRow_SSE2(const uint8* s
 #endif  // HAS_SOBELTOPLANEROW_SSE2
 
 #ifdef HAS_SOBELXYROW_SSE2
 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
 // A = 255
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_sobelx
     mov        esi, [esp + 4 + 8]   // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5986,17 +5986,17 @@ void ComputeCumulativeSumRow_SSE2(const 
 
  l1b:
   }
 }
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from the source image to a row of the destination,
 // stepping along the slope given in uv_dudv.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 12]  // src_argb
     mov        esi, [esp + 16]  // stride
@@ -6073,17 +6073,17 @@ void ARGBAffineRow_SSE2(const uint8* src
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 16x2 -> 16x1
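// Scalar sketch of the InterpolateRow_* family: blend a row with the row
// src_stride below it using an 8-bit fraction (0 keeps the top row, 128
// is an even average); the asm versions special-case common fractions:
#include <stddef.h>
#include <stdint.h>
static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* src,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {
  const uint8_t* src1 = src + src_stride;
  int f1 = source_y_fraction;
  int f0 = 256 - f1;
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] * f0 + src1[x] * f1) >> 8);
  }
}
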
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
                           int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6174,17 +6174,17 @@ void InterpolateRow_AVX2(uint8* dst_ptr,
     vzeroupper
     ret
   }
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
 
 #ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
                           int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6281,17 +6281,17 @@ void InterpolateRow_SSSE3(uint8* dst_ptr
     pop        esi
     ret
   }
 }
 #endif  // HAS_INTERPOLATEROW_SSSE3
 
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6393,17 +6393,17 @@ void InterpolateRow_SSE2(uint8* dst_ptr,
     pop        edi
     pop        esi
     ret
   }
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6499,17 +6499,17 @@ void InterpolateRow_Unaligned_SSSE3(uint
     pop        edi
     pop        esi
     ret
   }
 }
 
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6610,17 +6610,17 @@ void InterpolateRow_Unaligned_SSE2(uint8
   xloop99:
     pop        edi
     pop        esi
     ret
   }
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // src_uv_stride
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // pix
@@ -6635,17 +6635,17 @@ void HalfRow_SSE2(const uint8* src_uv, i
     lea        eax,  [eax + 16]
     jg         convertloop
     pop        edi
     ret
   }
 }
 
 #ifdef HAS_HALFROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // src_uv_stride
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // pix
@@ -6662,17 +6662,17 @@ void HalfRow_AVX2(const uint8* src_uv, i
 
     pop        edi
     vzeroupper
     ret
   }
 }
 #endif  // HAS_HALFROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_bayer
     movd       xmm5, [esp + 12]  // selector
     mov        ecx, [esp + 16]   // pix
     pshufd     xmm5, xmm5, 0
@@ -6689,17 +6689,17 @@ void ARGBToBayerRow_SSSE3(const uint8* s
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
     jg         wloop
     ret
   }
 }
 
 // Specialized ARGB to Bayer that just isolates the G channel.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                            uint32 selector, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_bayer
                                  // selector
     mov        ecx, [esp + 16]   // pix
     pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
@@ -6720,17 +6720,17 @@ void ARGBToBayerGGRow_SSE2(const uint8* 
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
     jg         wloop
     ret
   }
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
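// Scalar sketch: shuffler holds 4 byte indices applied per pixel, e.g.
// a {2, 1, 0, 3} table swaps the B and R channels:
#include <stdint.h>
static void ARGBShuffleRow_Sketch(const uint8_t* src, uint8_t* dst,
                                  const uint8_t* shuffler, int pix) {
  for (int i = 0; i < pix; ++i) {
    for (int c = 0; c < 4; ++c) {
      dst[4 * i + c] = src[4 * i + shuffler[c]];
    }
  }
}
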
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                           const uint8* shuffler, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_argb
     mov        ecx, [esp + 12]   // shuffler
     movdqa     xmm5, [ecx]
     mov        ecx, [esp + 16]   // pix
@@ -6746,17 +6746,17 @@ void ARGBShuffleRow_SSSE3(const uint8* s
     movdqa     [edx], xmm0
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
     jg         wloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                     const uint8* shuffler, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_argb
     mov        ecx, [esp + 12]   // shuffler
     movdqa     xmm5, [ecx]
     mov        ecx, [esp + 16]   // pix
@@ -6773,17 +6773,17 @@ void ARGBShuffleRow_Unaligned_SSSE3(cons
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     jg         wloop
     ret
   }
 }
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   __asm {
     mov        eax, [esp + 4]     // src_argb
     mov        edx, [esp + 8]     // dst_argb
     mov        ecx, [esp + 12]    // shuffler
     vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
     mov        ecx, [esp + 16]    // pix
@@ -6802,17 +6802,17 @@ void ARGBShuffleRow_AVX2(const uint8* sr
     jg         wloop
 
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   __asm {
     push       ebx
     push       esi
     mov        eax, [esp + 8 + 4]    // src_argb
     mov        edx, [esp + 8 + 8]    // dst_argb
     mov        esi, [esp + 8 + 12]   // shuffler
@@ -6928,17 +6928,17 @@ void ARGBShuffleRow_SSE2(const uint8* sr
 }
 
 // YUY2 - Macro-pixel = 2 image pixels
 // Y0U0Y1V0 Y2U2Y3V2 Y4U4Y5V4 ...

 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1 U2Y2V2Y3 ...
 
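// Scalar sketch of the packing described above (width assumed even):
#include <stdint.h>
static void I422ToYUY2Row_Sketch(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, uint8_t* dst_frame,
                                 int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}
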
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_frame, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_y
@@ -6966,17 +6966,17 @@ void I422ToYUY2Row_SSE2(const uint8* src
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToUYVYRow_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_frame, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_y
@@ -7005,17 +7005,17 @@ void I422ToUYVYRow_SSE2(const uint8* src
 
     pop        edi
     pop        esi
     ret
   }
 }
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   /* src_argb */
     mov        edx, [esp + 4 + 8]   /* dst_argb */
     mov        esi, [esp + 4 + 12]  /* poly */
@@ -7065,17 +7065,17 @@ void ARGBPolynomialRow_SSE2(const uint8*
     jg         convertloop
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 12]   /* poly */
     vbroadcastf128 ymm4, [ecx]       // C0
@@ -7106,17 +7106,17 @@ void ARGBPolynomialRow_AVX2(const uint8*
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Transform ARGB pixels with color table.
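// Scalar sketch: each channel is looked up in an interleaved 256-entry
// per-channel table, in place:
#include <stdint.h>
static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                     const uint8_t* table_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      dst_argb[4 * i + c] = table_argb[dst_argb[4 * i + c] * 4 + c];
    }
  }
}
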
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                            int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   /* dst_argb */
     mov        esi, [esp + 4 + 8]   /* table_argb */
     mov        ecx, [esp + 4 + 12]  /* width */
 
@@ -7141,17 +7141,17 @@ void ARGBColorTableRow_X86(uint8* dst_ar
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBCOLORTABLEROW_X86
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Transform RGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   /* dst_argb */
     mov        esi, [esp + 4 + 8]   /* table_argb */
     mov        ecx, [esp + 4 + 12]  /* width */
 
     // 1 pixel loop.
@@ -7173,17 +7173,17 @@ void RGBColorTableRow_X86(uint8* dst_arg
     pop        esi
     ret
   }
 }
 #endif  // HAS_RGBCOLORTABLEROW_X86
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Transform RGB pixels with luma table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                  int width,
                                  const uint8* luma, uint32 lumacoeff) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   /* src_argb */
     mov        edi, [esp + 8 + 8]   /* dst_argb */
--- a/media/libyuv/source/scale_win.cc
+++ b/media/libyuv/source/scale_win.cc
@@ -89,17 +89,17 @@ static uvec8 kShufAb2 =
   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
 
 // Scaling values for boxes of 3x2 and 2x2
 static uvec16 kScaleAb2 =
   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
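// Scalar sketches of the down-by-2 rows: point sampling keeps the odd
// pixel of each pair (matching the asm's psrlw/packuswb), and the box
// filter is a rounded 2x2 average:
#include <stddef.h>
#include <stdint.h>
static void ScaleRowDown2_Sketch(const uint8_t* src, uint8_t* dst,
                                 int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];
  }
}
static void ScaleRowDown2Box_Sketch(const uint8_t* src, ptrdiff_t src_stride,
                                    uint8_t* dst, int dst_width) {
  const uint8_t* t = src + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[2 * x] + src[2 * x + 1] +
                        t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}
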
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
 
@@ -117,17 +117,17 @@ void ScaleRowDown2_SSE2(const uint8* src
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x1 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
@@ -155,17 +155,17 @@ void ScaleRowDown2Linear_SSE2(const uint
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x2 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
@@ -199,17 +199,17 @@ void ScaleRowDown2Box_SSE2(const uint8* 
 
     pop        esi
     ret
   }
 }
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
@@ -228,17 +228,17 @@ void ScaleRowDown2_Unaligned_SSE2(const 
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x1 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
@@ -267,17 +267,17 @@ void ScaleRowDown2Linear_Unaligned_SSE2(
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x2 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -312,17 +312,17 @@ void ScaleRowDown2Box_Unaligned_SSE2(con
 
     pop        esi
     ret
   }
 }
 
 // Point samples 32 pixels to 8 pixels.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
@@ -345,17 +345,17 @@ void ScaleRowDown4_SSE2(const uint8* src
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x4 rectangle to 8x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_ptr
     mov        esi, [esp + 8 + 8]    // src_stride
     mov        edx, [esp + 8 + 12]   // dst_ptr
@@ -410,17 +410,17 @@ void ScaleRowDown4Box_SSE2(const uint8* 
 }
 
 // Point samples 32 pixels to 24 pixels.
 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 // Then shuffled to do the scaling.
 
 // Note that movdqa+palign may be better than movdqu.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     movdqa     xmm3, kShuf0
@@ -459,17 +459,17 @@ void ScaleRowDown34_SSSE3(const uint8* s
 // xmm3 shuf 1
 // xmm4 shuf 2
 // xmm5 madd 0
 // xmm6 madd 1
 // xmm7 kRound34
 
 // Note that movdqa+palign may be better than movdqu.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -518,17 +518,17 @@ void ScaleRowDown34_1_Box_SSSE3(const ui
 
     pop        esi
     ret
   }
 }
 
 // Note that movdqa+palign may be better than movdqu.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -581,17 +581,17 @@ void ScaleRowDown34_0_Box_SSSE3(const ui
     pop        esi
     ret
   }
 }
 
 // 3/8 point sampler
 
 // Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     movdqa     xmm4, kShuf38a
@@ -613,17 +613,17 @@ void ScaleRowDown38_SSSE3(const uint8* s
     lea        edx, [edx + 12]
     jg         xloop
 
     ret
   }
 }
 
 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -679,17 +679,17 @@ void ScaleRowDown38_3_Box_SSSE3(const ui
     jg         xloop
 
     pop        esi
     ret
   }
 }
 
 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -725,17 +725,17 @@ void ScaleRowDown38_2_Box_SSSE3(const ui
 
     pop        esi
     ret
   }
 }
 
 // Reads 16xN bytes and produces 16 shorts at a time.
 // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
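// Scalar sketch: column-wise sums of src_height rows into 16-bit
// accumulators, feeding the box filter:
#include <stddef.h>
#include <stdint.h>
static void ScaleAddRows_Sketch(const uint8_t* src, ptrdiff_t src_stride,
                                uint16_t* dst, int src_width,
                                int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16_t sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum = (uint16_t)(sum + src[y * src_stride + x]);
    }
    dst[x] = sum;
  }
}
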
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint16* dst_ptr, int src_width,
                        int src_height) {
   __asm {
     push       esi
     push       edi
     push       ebx
     push       ebp
@@ -795,17 +795,17 @@ void ScaleAddRows_SSE2(const uint8* src_
 // TODO(fbarchard): Switch the following:
 //    xor        ebx, ebx
 //    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
 // To
 //    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
 // when the drmemory bug is fixed.
 // https://code.google.com/p/drmemory/issues/detail?id=1396
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx) {
   __asm {
     push       ebx
     push       esi
     push       edi
     mov        edi, [esp + 12 + 4]    // dst_ptr
     mov        esi, [esp + 12 + 8]    // src_ptr
@@ -876,17 +876,17 @@ void ScaleFilterCols_SSSE3(uint8* dst_pt
     pop        esi
     pop        ebx
     ret
   }
 }
 
 // Reads 16 pixels, duplicates them and writes 32 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                        int dst_width, int x, int dx) {
   __asm {
     mov        edx, [esp + 4]    // dst_ptr
     mov        eax, [esp + 8]    // src_ptr
     mov        ecx, [esp + 12]   // dst_width
 
     align      4
@@ -903,17 +903,17 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, c
     jg         wloop
 
     ret
   }
 }
 
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                             ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_argb
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_argb
     mov        ecx, [esp + 16]       // dst_width
@@ -930,17 +930,17 @@ void ScaleARGBRowDown2_SSE2(const uint8*
     jg         wloop
 
     ret
   }
 }
 
 // Blends 8x1 rectangle to 4x1.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   uint8* dst_argb, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_argb
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_argb
     mov        ecx, [esp + 16]       // dst_width
@@ -960,17 +960,17 @@ void ScaleARGBRowDown2Linear_SSE2(const 
     jg         wloop
 
     ret
   }
 }
 
 // Blends 8x2 rectangle to 4x1.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                ptrdiff_t src_stride,
                                uint8* dst_argb, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_argb
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_argb
@@ -996,17 +996,17 @@ void ScaleARGBRowDown2Box_SSE2(const uin
 
     pop        esi
     ret
   }
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width) {
   __asm {
     push       ebx
     push       edi
     mov        eax, [esp + 8 + 4]    // src_argb
                                      // src_stride ignored
@@ -1034,17 +1034,17 @@ void ScaleARGBRowDownEven_SSE2(const uin
     pop        edi
     pop        ebx
     ret
   }
 }
 
 // Blends four 2x2 pixel blocks to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   int src_stepx,
                                   uint8* dst_argb, int dst_width) {
   __asm {
     push       ebx
     push       esi
     push       edi
@@ -1083,17 +1083,17 @@ void ScaleARGBRowDownEvenBox_SSE2(const 
     pop        edi
     pop        esi
     pop        ebx
     ret
   }
 }
 
 // Column scaling unfiltered. SSE2 version.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx) {
   __asm {
     push       edi
     push       esi
     mov        edi, [esp + 8 + 4]    // dst_argb
     mov        esi, [esp + 8 + 8]    // src_argb
     mov        ecx, [esp + 8 + 12]   // dst_width
@@ -1177,17 +1177,17 @@ static uvec8 kShuffleColARGB = {
   8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
 static uvec8 kShuffleFractions = {
   0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]    // dst_argb
     mov        esi, [esp + 8 + 8]    // src_argb
     mov        ecx, [esp + 8 + 12]   // dst_width
@@ -1252,17 +1252,17 @@ void ScaleARGBFilterCols_SSSE3(uint8* ds
     pop        edi
     pop        esi
     ret
   }
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx) {
   __asm {
     mov        edx, [esp + 4]    // dst_argb
     mov        eax, [esp + 8]    // src_argb
     mov        ecx, [esp + 12]   // dst_width
 
     align      4
@@ -1278,30 +1278,30 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_ar
     lea        edx, [edx + 32]
     jg         wloop
 
     ret
   }
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
     cdq                          // extend num to 64 bits
     shld       edx, eax, 16      // 32.16
     shl        eax, 16
     idiv       dword ptr [esp + 8]
     ret
   }
 }
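
// Portable sketch of FixedDiv_X86: the shld/shl pair builds the 64-bit
// dividend num << 16 in edx:eax for the 64/32-bit idiv. FixedDiv1_X86
// below is the same idea but first subtracts 0x00010001 from the
// dividend before dividing (see its asm).
#include <stdint.h>
static int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}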
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv1_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
     mov        ecx, [esp + 8]    // denom
     cdq                          // extend num to 64 bits
     shld       edx, eax, 16      // 32.16
     shl        eax, 16
     sub        eax, 0x00010001