back out bug 386585 (libpng update) due to bug 399630 (startup crash), a=stuart,Ryan,#developers
authorvladimir@pobox.com
Fri, 12 Oct 2007 13:57:04 -0700
changeset 6884 0aa4758f3001e60acc6e93f8c9b878bbc81d1097
parent 6883 97a3f175e540df0929f8b2d671e4e8154ab8b9e8
child 6885 81b7159e7fe4b4c20d97398c828886329a063da8
push idunknown
push userunknown
push dateunknown
reviewersstuart, Ryan
bugs386585, 399630
milestone1.9a9pre
back out bug 386585 (libpng update) due to bug 399630 (startup crash), a=stuart,Ryan,#developers
modules/libimg/png/CHANGES
modules/libimg/png/LICENSE
modules/libimg/png/MOZCHANGES
modules/libimg/png/Makefile.in
modules/libimg/png/README
modules/libimg/png/libpng.txt
modules/libimg/png/mozpngconf.h
modules/libimg/png/png.c
modules/libimg/png/png.h
modules/libimg/png/pngconf.h
modules/libimg/png/pngerror.c
modules/libimg/png/pnggccrd.c
modules/libimg/png/pngget.c
modules/libimg/png/pngpread.c
modules/libimg/png/pngread.c
modules/libimg/png/pngrtran.c
modules/libimg/png/pngrutil.c
modules/libimg/png/pngset.c
modules/libimg/png/pngtrans.c
modules/libimg/png/pngvcrd.c
modules/libimg/png/pngwio.c
modules/libimg/png/pngwrite.c
modules/libimg/png/pngwutil.c
--- a/modules/libimg/png/CHANGES
+++ b/modules/libimg/png/CHANGES
@@ -1475,17 +1475,17 @@ version 1.2.9beta4 [March 3, 2006]
 version 1.2.9beta5 [March 4, 2006]
   Removed trailing blanks from source files.
   Put version and date of latest change in each source file, and changed
     copyright year accordingly.
   More cleanup of configure.ac, Makefile.ac, and associated scripts.
   Restored scripts/makefile.elf which was inadvertently deleted.
 
 version 1.2.9beta6 [March 6, 2006]
-  Fixed typo (21) in configuration files.
+  Fixed typo (12) in configuration files.
 
 version 1.2.9beta7 [March 7, 2006]
   Removed libpng.vers and libpng.sym from libpng12_la_SOURCES in Makefile.am
   Fixed inconsistent #ifdef's around png_sig_bytes() and png_set_sCAL_s()
     in png.h.
   Updated makefile.elf as suggested by debian.
   Made cosmetic changes to some makefiles, adding LN_SF and other macros.
   Made some makefiles accept "exec_prefix".
@@ -1610,366 +1610,15 @@ version 1.0.19rc5, 1.2.11rc5 [June 22, 2
 
 version 1.0.19, 1.2.11 [June 26, 2006]
   None.
 
 version 1.0.20, 1.2.12 [June 27, 2006]
   Really increased sprintf buffer from 50 to 52 chars in pngrutil.c to avoid
     buffer overflow.
 
-version 1.2.13beta1 [October 2, 2006]
-  Removed AC_FUNC_MALLOC from configure.ac
-  Work around Intel-Mac compiler bug by setting PNG_NO_MMX_CODE in pngconf.h
-  Change "logical" to "bitwise" throughout documentation.
-  Detect and fix attempt to write wrong iCCP profile length.
-
-version 1.0.21, 1.2.13 [November 14, 2006]
-  Fix potential buffer overflow in sPLT chunk handler.
-  Fix Makefile.am to not try to link to noexistent files.
-  Check all exported functions for NULL png_ptr.
-
-version 1.2.14beta1 [November 17, 2006]
-  Relocated three misplaced tests for NULL png_ptr.
-  Built Makefile.in with automake-1.9.6 instead of 1.9.2.
-  Build configure with autoconf-2.60 instead of 2.59
-
-version 1.2.14beta2 [November 17, 2006]
-  Added some typecasts in png_zalloc().
-
-version 1.2.14rc1 [November 20, 2006]
-  Changed "strtod" to "png_strtod" in pngrutil.c
-
-version 1.0.22, 1.2.14    [November 27, 2006]
-  Added missing "$(srcdir)" in Makefile.am and Makefile.in
-
-version 1.2.15beta1 [December 3, 2006]
-  Generated configure with autoconf-2.61 instead of 2.60
-  Revised configure.ac to update libpng.pc and libpng-config.
-
-version 1.2.15beta2 [December 3, 2006]
-  Always export MMX asm functions, just stubs if not building pnggccrd.c
-
-version 1.2.15beta3 [December 4, 2006]
-  Add "png_bytep" typecast to profile while calculating length in pngwutil.c
-
-version 1.2.15beta4 [December 7, 2006]
-  Added scripts/CMakeLists.txt
-  Changed PNG_NO_ASSEMBLER_CODE to PNG_NO_MMX_CODE in scripts, like 1.4.0beta
-
-version 1.2.15beta5 [December 7, 2006]
-  Changed some instances of PNG_ASSEMBLER_* to PNG_MMX_* in pnggccrd.c
-  Revised scripts/CMakeLists.txt
-
-version 1.2.15beta6 [December 13, 2006]
-  Revised scripts/CMakeLists.txt and configure.ac
-
-version 1.2.15rc1 [December 18, 2006]
-  Revised scripts/CMakeLists.txt
-
-version 1.2.15rc2 [December 21, 2006]
-  Added conditional #undef jmpbuf in pngtest.c to undo #define in AIX headers.
-  Added scripts/makefile.nommx
-
-version 1.2.15rc3 [December 25, 2006]
-  Fixed shared library numbering error that was intruduced in 1.2.15beta6.
-
-version 1.2.15rc4 [December 27, 2006]
-  Fixed handling of rgb_to_gray when png_ptr->color.gray isn't set.
-
-version 1.2.15rc5 [December 31, 2006]
-  Revised handling of rgb_to_gray.
-
-version 1.0.23, 1.2.15 [January 5, 2007]
-  Added some (unsigned long) typecasts in pngtest.c to avoid printing errors.
-
-version 1.2.16beta1 [January 6, 2007]
-  Fix bugs in makefile.nommx
-
-version 1.2.16beta2 [January 16, 2007]
-  Revised scripts/CMakeLists.txt
- 
-version 1.0.24, 1.2.16 [January 31, 2007]
-  No changes.
- 
-version 1.2.17beta1 [March 6, 2007]
-  Revised scripts/CMakeLists.txt to install both shared and static libraries.
-  Deleted a redundant line from pngset.c.
- 
-version 1.2.17beta2 [April 26, 2007]
-  Relocated misplaced test for png_ptr == NULL in pngpread.c
-  Change "==" to "&" for testing PNG_RGB_TO_GRAY_ERR & PNG_RGB_TO_GRAY_WARN
-    flags.
-  Changed remaining instances of PNG_ASSEMBLER_* to PNG_MMX_*
-  Added pngerror() when write_IHDR fails in deflateInit2().
-  Added "const" to some array declarations.
-  Mention examples of libpng usage in the libpng*.txt and libpng.3 documents.
-
-version 1.2.17rc1 [May 4, 2007]
-  No changes.
-
-version 1.2.17rc2 [May 8, 2007]
-  Moved several PNG_HAVE_* macros out of PNG_INTERNAL because applications
-    calling set_unknown_chunk_location() need them.
-  Changed transformation flag from PNG_EXPAND_tRNS to PNG_EXPAND in
-    png_set_expand_gray_1_2_4_to_8().
-  Added png_ptr->unknown_chunk to hold working unknown chunk data, so it
-    can be free'ed in case of error.  Revised unknown chunk handling in
-    pngrutil.c and pngpread.c to use this structure.
- 
-version 1.2.17rc3 [May 8, 2007]
-  Revised symbol-handling in configure script.
-
-version 1.2.17rc4 [May 10, 2007]
-  Revised unknown chunk handling to avoid storing unknown critical chunks.
-
-version 1.0.25 [May 15, 2007]
-version 1.2.17 [May 15, 2007]
-  Added "png_ptr->num_trans=0" before error return in png_handle_tRNS,
-    to eliminate a vulnerability (CVE-2007-2445, CERT VU#684664)
-
-version 1.0.26 [May 15, 2007]
-version 1.2.18 [May 15, 2007]
-  Reverted the libpng-1.2.17rc3 change to symbol-handling in configure script
-
-version 1.2.19beta1 [May 18, 2007]
-  Changed "const static" to "static PNG_CONST" everywhere, mostly undoing
-    change of libpng-1.2.17beta2.  Changed other "const" to "PNG_CONST"
-  Changed some handling of unused parameters, to avoid compiler warnings.
-    "if (unused == NULL) return;" becomes "unused = unused".
-
-version 1.2.19beta2 [May 18, 2007]
-  Only use the valid bits of tRNS value in png_do_expand() (Brian Cartier)
-
-version 1.2.19beta3 [May 19, 2007]
-  Add some "png_byte" typecasts in png_check_keyword() and write new_key
-  instead of key in zTXt chunk (Kevin Ryde).
-
-version 1.2.19beta4 [May 21, 2007]
-  Add png_snprintf() function and use it in place of sprint() for improved
-    defense against buffer overflows.
-
-version 1.2.19beta5 [May 21, 2007]
-  Fixed png_handle_tRNS() to only use the valid bits of tRNS value.
-  Changed handling of more unused parameters, to avoid compiler warnings.
-  Removed some PNG_CONST in pngwutil.c to avoid compiler warnings.
-
-version 1.2.19beta6 [May 22, 2007]
-  Added some #ifdef PNG_MMX_CODE_SUPPORTED where needed in pngvcrd.c
-  Added a special "_MSC_VER" case that defines png_snprintf to _snprintf
-
-version 1.2.19beta7 [May 22, 2007]
-  Squelched png_squelch_warnings() in pnggccrd.c and added an
-    #ifdef PNG_MMX_CODE_SUPPORTED block around the declarations that caused
-    the warnings that png_squelch_warnings was squelching.
-
-version 1.2.19beta8 [May 22, 2007]
-  Removed __MMX__ from test in pngconf.h.
-
-version 1.2.19beta9 [May 23, 2007]
-  Made png_squelch_warnings() available via PNG_SQUELCH_WARNINGS macro.
-  Revised png_squelch_warnings() so it might work.
-  Updated makefile.sgcc and makefile.solaris; added makefile.solaris-x86.
-
-version 1.2.19beta10 [May 24, 2007]
-  Resquelched png_squelch_warnings(), use "__attribute__((used))" instead.
-
-version 1.2.19beta11 [May 28, 2007]
-  Return 0 from png_get_sPLT() and png_get_unknown_chunks() if png_ptr is NULL;
-    changed three remaining instances of png_strcpy() to png_strncpy() (David
-    Hill).
-  Make test for NULL row_buf at the beginning of png_do_read_transformations
-    unconditional.
-
-version 1.2.19beta12 [May 28, 2007]
-  Revised pnggccrd.c.
-
-version 1.2.19beta13 [June 14, 2007]
-  Prefer PNG_USE_PNGVCRD when _MSC_VER is defined in pngconf.h
-
-version 1.2.19beta14 [June 16, 2007]
-  Fix bug with handling of 16-bit transparency, introduced in 1.2.19beta2
-
-version 1.2.19beta15 [June 17, 2007]
-  Revised pnggccrd.c.
-
-version 1.2.19beta16 [June 18, 2007]
-  Revised pnggccrd.c again.
-  Updated contrib/gregbook.
-  Changed '#include "pnggccrd.c"' to 'include "$srcdir/pnggccrd.c"'
-    in configure.ac
-
-version 1.2.19beta17 [June 19, 2007]
-  Revised many of the makefiles, to set -DPNG_NO_MMX_CODE where needed
-    and to not use -O3 unless -DPNG_NO_MMX_CODE is also set.
-
-version 1.2.19beta18 [June 23, 2007]
-  Replaced some C++ style comments with C style comments in pnggccrd.c.
-  Copied optimized C code from pnggccrd.c to pngrutil.c, removed dependency
-    on pnggccrd.o from many makefiles.
-  Added sl and dylib to list of extensions be installed by Makefile.am
-
-version 1.2.19beta19 [June 28, 2007]
-  Fixed testing PNG_RGB_TO_GRAY_ERR & PNG_RGB_TO_GRAY_WARN in pngrtran.c
-  More cleanup of pnggccrd.c and pngvcrd.c
-
-version 1.2.19beta20 [June 29, 2007]
-  Rebuilt Makefile.in and configure using libtool-1.5.24.
-  Fixed typo in pnggccrd.c
-
-version 1.2.19beta21 [June 30, 2007]
-  More revision of pnggccrd.c
-  Added "test" target to Makefile.in and Makefile.am
-
-version 1.2.19beta22 [July 3, 2007]
-  Added info about pngrutil/pnggccrd/pngvcrd to png_get_header_version()
-  Fix type definition of dummy_value_a, b in pnggccrd.c
-
-version 1.2.19beta23 [July 10, 2007]
-  Revert change to type definition of dummy_value_a, b in pnggccrd.c
-  Make sure __PIC__ is defined in pnggccrd.c when PIC is defined.
-  Require gcc-4.1 or better to use PNG_HAVE_MMX_FILTER_ROW on x86_64 platforms
-
-version 1.2.19beta24 [July 14, 2007]
-  Added PNG_NO_READ_FILTER, PNG_NO_WRITE_FILTER, PNG_NO_WARNING macros.
-  Added contrib/pngminim to demonstrate building minimal encoder and decoder
-
-version 1.2.19beta25 [July 15, 2007]
-  Removed the new PNG_NO_READ_FILTER macro since it would make the library
-    unable to read valid PNG files, and filtering is at the heart of the
-    PNG format.
-
-version 1.2.19beta26 [July 16, 2007]
-  Changed "png_free(str)" to "png_free(png_ptr,str)" in pngrutil.c WinCE
-    code (Yves Piguet).  This bug was introduced in libpng-1.2.14.
-  Updated scripts/CMakeLists.txt
-  Relocated a misplaced #endif in pnggccrd.c
-
-version 1.2.19beta27 [July 17, 2007]
-  Fixed incorrect stride and number of bytes copied (was 4 instead of
-    6 bytes) in the cleanup loop of pnggccrd.c and pngvcrd.c for handling
-    the end of 48-bit interlaced rows (Glenn R-P).
-
-version 1.2.19beta28 [July 19, 2007]
-  Removed requirement for gcc-4.1 or better to use PNG_HAVE_MMX_FILTER_ROW
-    on x86_64 platforms
-  Added png_warning() in pngrutil.c for short iCCP, iTXt, sPLT, or zTXT chunks.
-  Revised pngtest.c so warnings are displayed regardless of PNG_NO_STDIO.
-
-version 1.2.19beta29 [July 20, 2007]
-  Fix typo in pnggccrd.c (%%eax should be %%ax in secondloop48)
-
-version 1.2.19beta30 [July 26, 2007]
-  Revised pnggccrd.c
-
-version 1.2.19beta31 [July 27, 2007]
-  Fix typos in pnggccrd.c
-
-version 1.0.27rc1 and 1.2.19rc1 [July 31, 2007]
-  Disable PNG_MMX_CODE_SUPPORTED when PNG_ASSEMBLER_CODE_SUPPORTED is off.
-  Enable PNG_MMX_READ_FILTER_* by default, except when gcc-3.x is being
-    used (they were inadvertently disabled in libpng-1.2.19beta23).
-  Fix some debugging statements in pnggccrd.c and pngrutil.c
-  Added information about disabling the MMX code in libpng documentation.
-
-version 1.0.27rc2 and 1.2.19rc2 [August 4, 2007]
-  Removed some "#if 0" blocks.
-  Made a global struct local in pngvcrd.c to make it thread safe.
-  Issue a png_error() if application attempts to transform a row tht
-    has not been initialized.
-
-version 1.0.27rc3 and 1.2.19rc3 [August 9, 2007]
-  Slightly revised pngvcrd.c
-
-version 1.0.27rc4 and 1.2.19rc4 [August 9, 2007]
-  Revised pnggccrd.c debugging change of rc1, which was broken.
-  Revised scripts/CMakeLists.txt
-  Change default to PNG_NO_GLOBAL_ARRAYS for MSVC.
-  Turn off PNG_FLAG_ROW_INIT flag when setting transforms that expand pixels.
-
-version 1.0.27rc5 and 1.2.19rc5 [August 10, 2007]
-  Fix typo (missing '"') in pnggccrd.c
-  Revise handling of png_strtod in recent versions of WINCE
-
-version 1.0.27rc6 and 1.2.19rc6 [August 15, 2007]
-  Fix typo (missing ',') in contrib/gregbook/readpng2.c
-  Undid row initialization error exit added to rc2 and rc4.
-
-version 1.0.27 and 1.2.19 [August 18, 2007]
-  Conditionally restored row initialization error exit.
-
-version 1.2.20beta01 [August 19, 2007]
-  Fixed problem with compiling pnggccrd.c on Intel-Apple platforms.
-  Added PNG_NO_ERROR_TEXT feature, with demo in contrib/pngminim
-  Removed define PNG_WARN_UNINITIALIZED_ROW 1 /* 0: warning; 1: error */
-    because it caused some trouble.
-
-version 1.2.20beta02 [August 20, 2007]
-  Avoid compiling pnggccrd.c on Intel-Apple platforms.
-
-version 1.2.20beta03 [August 20, 2007]
-  Added "/D PNG_NO_MMX_CODE" to the non-mmx builds of projects/visualc6
-    and visualc71.
-
-version 1.2.20beta04 [August 21, 2007]
-  Revised pngvcrd.c for improved efficiency (Steve Snyder).
-
-version 1.2.20rc1 [August 23, 2007]
-  Revised pngconf.h to set PNG_NO_MMX_CODE for gcc-3.x compilers.
-
-version 1.2.20rc2 [August 27, 2007]
-  Revised scripts/CMakeLists.txt
-  Revised #ifdefs to ensure one and only one of pnggccrd.c, pngvcrd.c,
-    or part of pngrutil.c is selected.
-
-version 1.2.20rc3 [August 30, 2007]
-  Remove a little more code in pngwutil.c when PNG_NO_WRITE_FILTER is selected.
-  Added /D _CRT_SECURE_NO_WARNINGS to visual6c and visualc71 projects.
-  Compile png_mmx_support() in png.c even when PNG_NO_MMX_CODE is defined.
-  Restored a "superfluous" #ifdef that was removed from 1.2.20rc2 pnggccrd.c,
-    breaking the png_mmx_support() function.
-
-version 1.2.20rc4 [September 1, 2007]
-  Removed Intel contributions (MMX, Optimized C).
-
-version 1.2.20rc5 [September 2, 2007]
-  Restored configure and Makefile.in to rc3 and put a snippet of code in
-    pnggccrd.c, to ensure configure makes the same PNG_NO_MMX_CODE selection
-
-version 1.2.20rc6 [September 2, 2007]
-  Fixed bugs in scripts/CMakeLists.txt
-  Removed pngvcrd.c references from msvc projects.
-
-version 1.0.28 and 1.2.20 [September 8, 2007]
-  Removed "(NO READ SUPPORT)" from png_get_header_version() string.
-
-version 1.2.21beta1 [September 14, 2007]
-  Fixed various mistakes reported by George Cook and Jeff Phillips:
-  logical vs bitwise NOT in pngrtran.c, bug introduced in 1.2.19rc2
-  16-bit cheap transparency expansion, bug introduced in 1.2.19beta2
-  errors with sizeof(unknown_chunk.name), bugs introduced in 1.2.19beta11
-  <= compare with unsigned var in pngset.c, should be ==.
-
-version 1.2.21beta2 [September 18, 2007]
-  Removed some extraneous typecasts.
-
-version 1.2.21rc1 [September 25, 2007]
-  Fixed potential out-of-bounds reads in png_handle_pCAL() and
-    png_handle_ztXt() ("flayer" results reported by Tavis Ormandy).
-
-version 1.2.21rc2 [September 26, 2007]
-  Fixed potential out-of-bounds reads in png_handle_sCAL(),
-    png_handle_iTXt(), and png_push_read_tEXt().
-  Remove some PNG_CONST declarations from pngwutil.c to avoid compiler warnings
-  Revised makefiles to update paths in libpng.pc properly.
-
-version 1.2.21rc3 [September 27, 2007]
-  Revised makefiles to update "Libs" in libpng.pc properly.
-
-version 1.0.29 and 1.2.21rc3 [October 4, 2007]
-  No changes.
-
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net
 (subscription required; visit
 https://lists.sourceforge.net/lists/listinfo/png-mng-implement
 to subscribe)
 or to glennrp at users.sourceforge.net
 
 Glenn R-P
--- a/modules/libimg/png/LICENSE
+++ b/modules/libimg/png/LICENSE
@@ -3,18 +3,18 @@ This copy of the libpng notices is provi
 any discrepancy between this copy and the notices in the file png.h that is
 included in the libpng distribution, the latter shall prevail.
 
 COPYRIGHT NOTICE, DISCLAIMER, and LICENSE:
 
 If you modify libpng you may insert additional notices immediately following
 this sentence.
 
-libpng versions 1.2.6, August 15, 2004, through 1.2.21, October 4, 2007, are
-Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson, and are
+libpng versions 1.2.6, August 15, 2004, through 1.2.12, June 27, 2006, are
+Copyright (c) 2004, 2006 Glenn Randers-Pehrson, and are
 distributed according to the same disclaimer and license as libpng-1.2.5
 with the following individual added to the list of Contributing Authors
 
    Cosmin Truta
 
 libpng versions 1.0.7, July 1, 2000, through 1.2.5 - October 3, 2002, are
 Copyright (c) 2000-2002 Glenn Randers-Pehrson, and are
 distributed according to the same disclaimer and license as libpng-1.0.6
@@ -101,9 +101,9 @@ boxes and the like:
 Also, the PNG logo (in PNG format, of course) is supplied in the
 files "pngbar.png" and "pngbar.jpg (88x31) and "pngnow.png" (98x31).
 
 Libpng is OSI Certified Open Source Software.  OSI Certified Open Source is a
 certification mark of the Open Source Initiative.
 
 Glenn Randers-Pehrson
 glennrp at users.sourceforge.net
-October 4, 2007
+June 27, 2006
--- a/modules/libimg/png/MOZCHANGES
+++ b/modules/libimg/png/MOZCHANGES
@@ -1,18 +1,14 @@
 
 Changes made to pristine png source by mozilla.org developers.
 
-2007/10/04  -- Sync with libpng-1.2.21 (removes MMX support) (bug #386585)
-
-2007/07/27  -- Enable cHRM and iCCP chunks for color management (bug #16769)
+2007/05/05  -- Zero png_ptr->num_trans on CRC error (bug 374810)
 
-2007/05/05  -- Zero png_ptr->num_trans on CRC error (bug #374810)
+2007/04/19  -- Synced with libpng-1.2.16 tree
 
-2007/04/19  -- Synced with libpng-1.2.16 tree (bug #373249)
+2007/03/20  -- Added support for APNG
 
-2007/03/20  -- Added support for APNG (bug #257197)
-
-2006/06/27  -- Synced with libpng-1.2.12 tree (bug #334110)
+2006/06/27  -- Synced with libpng-1.2.12 tree
 
-2004/10/07  -- Synced with libpng-1.2.7 tree (bug #261922)
+2004/10/07  -- Synced with libpng-1.2.7 tree
 
-2004/10/07  -- add mozpngconf.h (bug #208607)
+2004/10/07  -- add mozpngconf.h (bug 208607)
--- a/modules/libimg/png/Makefile.in
+++ b/modules/libimg/png/Makefile.in
@@ -64,17 +64,16 @@ CSRCS		= \
 		pngrutil.c \
 		pngset.c \
 		pngtrans.c \
 		pngwio.c \
 		pngwrite.c \
 		pngwtran.c \
 		pngrtran.c \
 		pngwutil.c \
-		pngvcrd.c \
 		$(NULL)
 
 EXPORTS		= png.h pngconf.h mozpngconf.h
 
 LOCAL_INCLUDES	= -I$(srcdir)
 
 FORCE_STATIC_LIB = 1
 # This library is used by other shared libs in a static build
--- a/modules/libimg/png/README
+++ b/modules/libimg/png/README
@@ -1,9 +1,9 @@
-README for libpng version 1.2.21 - October 4, 2007 (shared library 12.0)
+README for libpng version 1.2.12 - June 27, 2006 (shared library 12.0)
 See the note about version numbers near the top of png.h
 
 See INSTALL for instructions on how to install libpng.
 
 Libpng comes in several distribution formats.  Get libpng-*.tar.gz
 or libpng-*.tar.bz2 if you want UNIX-style line endings in the text
 files, or lpng*.zip if you want DOS-style line endings.
 
@@ -185,21 +185,21 @@ Files in this distribution:
                             Netware.
        wince.txt        =>  Contains instructions for downloading a Microsoft
                             Visual C++ (Windows CD Toolkit) workspace for
                             building libpng and zlib on WindowsCE
       scripts       =>  Directory containing scripts for building libpng:
        descrip.mms      =>  VMS makefile for MMS or MMK
        makefile.std     =>  Generic UNIX makefile (cc, creates static libpng.a)
        makefile.elf     =>  Linux/ELF makefile symbol versioning,
-                            gcc, creates libpng12.so.0.1.2.21)
+                            gcc, creates libpng12.so.0.1.2.12)
        makefile.linux   =>  Linux/ELF makefile
-                            (gcc, creates libpng12.so.0.1.2.21)
+                            (gcc, creates libpng12.so.0.1.2.12)
        makefile.gcmmx   =>  Linux/ELF makefile
-                            (gcc, creates libpng12.so.0.1.2.21,
+                            (gcc, creates libpng12.so.0.1.2.12,
                             uses assembler code tuned for Intel MMX platform)
        makefile.gcc     =>  Generic makefile (gcc, creates static libpng.a)
        makefile.knr     =>  Archaic UNIX Makefile that converts files with
                             ansi2knr (Requires ansi2knr.c from
                             ftp://ftp.cs.wisc.edu/ghost)
        makefile.aix     =>  AIX makefile
        makefile.cygwin  =>  Cygwin/gcc makefile
        makefile.darwin  =>  Darwin makefile
@@ -211,22 +211,22 @@ Files in this distribution:
        makefile.ibmc    =>  IBM C/C++ version 3.x for Win32 and OS/2 (static)
        makefile.intel   =>  Intel C/C++ version 4.0 and later
        libpng.icc       =>  Project file, IBM VisualAge/C++ 4.0 or later
        makefile.netbsd  =>  NetBSD/cc makefile, PNGGCCRD, makes libpng.so.
        makefile.ne12bsd  =>  NetBSD/cc makefile, PNGGCCRD, makes libpng12.so
        makefile.openbsd =>  OpenBSD makefile
        makefile.sgi     =>  Silicon Graphics IRIX (cc, creates static lib)
        makefile.sggcc   =>  Silicon Graphics
-                            (gcc, creates libpng12.so.0.1.2.21)
+                            (gcc, creates libpng12.so.0.1.2.12)
        makefile.sunos   =>  Sun makefile
        makefile.solaris =>  Solaris 2.X makefile
-                            (gcc, creates libpng12.so.0.1.2.21)
+                            (gcc, creates libpng12.so.0.1.2.12)
        makefile.so9     =>  Solaris 9 makefile
-                            (gcc, creates libpng12.so.0.1.2.21)
+                            (gcc, creates libpng12.so.0.1.2.12)
        makefile.32sunu  =>  Sun Ultra 32-bit makefile
        makefile.64sunu  =>  Sun Ultra 64-bit makefile
        makefile.sco     =>  For SCO OSr5  ELF and Unixware 7 with Native cc
        makefile.mips    =>  MIPS makefile
        makefile.acorn   =>  Acorn makefile
        makefile.amiga   =>  Amiga makefile
        smakefile.ppc    =>  AMIGA smakefile for SAS C V6.58/7.00 PPC
                             compiler (Requires SCOPTIONS, copied from
--- a/modules/libimg/png/libpng.txt
+++ b/modules/libimg/png/libpng.txt
@@ -1,14 +1,14 @@
 libpng.txt - A description on how to use and modify libpng
 
- libpng version 1.2.21 - October 4, 2007
+ libpng version 1.2.12 - June 27, 2006
  Updated and distributed by Glenn Randers-Pehrson
  <glennrp at users.sourceforge.net>
- Copyright (c) 1998-2007 Glenn Randers-Pehrson
+ Copyright (c) 1998-2005 Glenn Randers-Pehrson
  For conditions of distribution and use, see copyright
  notice in png.h.
 
  based on:
 
  libpng 1.0 beta 6  version 0.96 May 28, 1997
  Updated and distributed by Andreas Dilger
  Copyright (c) 1996, 1997 Andreas Dilger
@@ -28,20 +28,16 @@ This file describes how to use and modif
 (known as libpng) for your own use.  There are five sections to this
 file: introduction, structures, reading, writing, and modification and
 configuration notes for various special platforms.  In addition to this
 file, example.c is a good starting point for using the library, as
 it is heavily commented and should include everything most people
 will need.  We assume that libpng is already installed; see the
 INSTALL file for instructions on how to install libpng.
 
-For examples of libpng usage, see the files "example.c", "pngtest.c",
-and the files in the "contrib" directory, all of which are included in the
-libpng distribution.
-
 Libpng was written as a companion to the PNG specification, as a way
 of reducing the amount of time and effort it takes to support the PNG
 file format in application programs.
 
 The PNG specification (second edition), November 2003, is available as
 a W3C Recommendation and as an ISO Standard (ISO/IEC 15948:2003 (E)) at
 <http://www.w3.org/TR/2003/REC-PNG-20031110/
 The W3C and ISO documents have identical technical content.
@@ -81,17 +77,19 @@ useful for more than PNG files, and can 
 See the documentation delivered with zlib for more details.
 You can usually find the source files for the zlib utility wherever you
 find the libpng source files.
 
 Libpng is thread safe, provided the threads are using different
 instances of the structures.  Each thread should have its own
 png_struct and png_info instances, and thus its own image.
 Libpng does not protect itself against two threads using the
-same instance of a structure.
+same instance of a structure.  Note: thread safety may be defeated
+by use of some of the MMX assembler code in pnggccrd.c, which is only
+compiled when the user defines PNG_THREAD_UNSAFE_OK.
 
 II. Structures
 
 There are two main structures that are important to libpng, png_struct
 and png_info.  The first, png_struct, is an internal structure that
 will not, for the most part, be used by a user except as the first
 variable passed to every libpng function call.
 
@@ -389,17 +387,17 @@ you want to do are limited to the follow
                                 to transparency
     PNG_TRANSFORM_SWAP_ENDIAN   Byte-swap 16-bit samples
 
 (This excludes setting a background color, doing gamma transformation,
 dithering, and setting filler.)  If this is the case, simply do this:
 
     png_read_png(png_ptr, info_ptr, png_transforms, NULL)
 
-where png_transforms is an integer containing the bitwise OR of
+where png_transforms is an integer containing the logical OR of
 some set of transformation flags.  This call is equivalent to png_read_info(),
 followed the set of transformations indicated by the transform mask,
 then png_read_image(), and finally png_read_end().
 
 (The final parameter of this call is not yet used.  Someday it might point
 to transformation parameters required by some future input transform.)
 
 You must use png_transforms and not call any png_set_transform() functions
@@ -1269,17 +1267,17 @@ When you are done, you can free all memo
    png_destroy_read_struct(&png_ptr, &info_ptr,
        &end_info);
 
 It is also possible to individually free the info_ptr members that
 point to libpng-allocated storage with the following function:
 
     png_free_data(png_ptr, info_ptr, mask, seq)
     mask - identifies data to be freed, a mask
-           containing the bitwise OR of one or
+           containing the logical OR of one or
            more of
              PNG_FREE_PLTE, PNG_FREE_TRNS,
              PNG_FREE_HIST, PNG_FREE_ICCP,
              PNG_FREE_PCAL, PNG_FREE_ROWS,
              PNG_FREE_SCAL, PNG_FREE_SPLT,
              PNG_FREE_TEXT, PNG_FREE_UNKN,
            or simply PNG_FREE_ALL
     seq  - sequence number of item to be freed
@@ -1331,17 +1329,17 @@ if you transfer responsibility for free'
 application, your application must not separately free those members.
 
 The png_free_data() function will turn off the "valid" flag for anything
 it frees.  If you need to turn the flag off for a chunk that was freed by your
 application instead of by libpng, you can use
 
     png_set_invalid(png_ptr, info_ptr, mask);
     mask - identifies the chunks to be made invalid,
-           containing the bitwise OR of one or
+           containing the logical OR of one or
            more of
              PNG_INFO_gAMA, PNG_INFO_sBIT,
              PNG_INFO_cHRM, PNG_INFO_PLTE,
              PNG_INFO_tRNS, PNG_INFO_bKGD,
              PNG_INFO_hIST, PNG_INFO_pHYs,
              PNG_INFO_oFFs, PNG_INFO_tIME,
              PNG_INFO_pCAL, PNG_INFO_sRGB,
              PNG_INFO_iCCP, PNG_INFO_sPLT,
@@ -1643,17 +1641,17 @@ July 1999 PNG specification, version 1.2
 a PNG datastream that is to be embedded in a MNG datastream).  The third
 parameter is a flag that indicates which filter type(s) are to be tested
 for each scanline.  See the PNG specification for details on the specific filter
 types.
 
 
     /* turn on or off filtering, and/or choose
        specific filters.  You can use either a single
-       PNG_FILTER_VALUE_NAME or the bitwise OR of one
+       PNG_FILTER_VALUE_NAME or the logical OR of one
        or more PNG_FILTER_NAME masks. */
     png_set_filter(png_ptr, 0,
        PNG_FILTER_NONE  | PNG_FILTER_VALUE_NONE |
        PNG_FILTER_SUB   | PNG_FILTER_VALUE_SUB  |
        PNG_FILTER_UP    | PNG_FILTER_VALUE_UP   |
        PNG_FILTER_AVE   | PNG_FILTER_VALUE_AVE  |
        PNG_FILTER_PAETH | PNG_FILTER_VALUE_PAETH|
        PNG_ALL_FILTERS);
@@ -2016,17 +2014,17 @@ transformations are permitted, enabled b
     PNG_TRANSFORM_SWAP_ENDIAN   Byte-swap 16-bit samples
     PNG_TRANSFORM_STRIP_FILLER  Strip out filler bytes.
 
 If you have valid image data in the info structure (you can use
 png_set_rows() to put image data in the info structure), simply do this:
 
     png_write_png(png_ptr, info_ptr, png_transforms, NULL)
 
-where png_transforms is an integer containing the bitwise OR of some set of
+where png_transforms is an integer containing the logical OR of some set of
 transformation flags.  This call is equivalent to png_write_info(),
 followed the set of transformations indicated by the transform mask,
 then png_write_image(), and finally png_write_end().
 
 (The final parameter of this call is not yet used.  Someday it might point
 to transformation parameters required by some future output transform.)
 
 You must use png_transforms and not call any png_set_transform() functions
@@ -2276,17 +2274,17 @@ When you are done, you can free all memo
 
     png_destroy_write_struct(&png_ptr, &info_ptr);
 
 It is also possible to individually free the info_ptr members that
 point to libpng-allocated storage with the following function:
 
     png_free_data(png_ptr, info_ptr, mask, seq)
     mask  - identifies data to be freed, a mask
-            containing the bitwise OR of one or
+            containing the logical OR of one or
             more of
               PNG_FREE_PLTE, PNG_FREE_TRNS,
               PNG_FREE_HIST, PNG_FREE_ICCP,
               PNG_FREE_PCAL, PNG_FREE_ROWS,
               PNG_FREE_SCAL, PNG_FREE_SPLT,
               PNG_FREE_TEXT, PNG_FREE_UNKN,
             or simply PNG_FREE_ALL
     seq   - sequence number of item to be freed
@@ -2348,23 +2346,27 @@ separately, do not transfer responsibili
 because when libpng fills a png_text structure it combines these members with
 the key member, and png_free_data() will free only text_ptr.key.  Similarly,
 if you transfer responsibility for free'ing text_ptr from libpng to your
 application, your application must not separately free those members.
 For a more compact example of writing a PNG image, see the file example.c.
 
 V. Modifying/Customizing libpng:
 
-There are two issues here.  The first is changing how libpng does
+There are three issues here.  The first is changing how libpng does
 standard things like memory allocation, input/output, and error handling.
 The second deals with more complicated things like adding new chunks,
 adding new transformations, and generally changing how libpng works.
 Both of those are compile-time issues; that is, they are generally
 determined at the time the code is written, and there is rarely a need
-to provide the user with a means of changing them.
+to provide the user with a means of changing them.  The third is a
+run-time issue:  choosing between and/or tuning one or more alternate
+versions of computationally intensive routines; specifically, optimized
+assembly-language (and therefore compiler- and platform-dependent)
+versions.
 
 Memory allocation, input/output, and error handling
 
 All of the memory allocation, input/output, and error handling in libpng
 goes through callbacks that are user-settable.  The default routines are
 in pngmem.c, pngrio.c, pngwio.c, and pngerror.c, respectively.  To change
 these functions, call the appropriate png_set_*_fn() function.
 
@@ -2715,30 +2717,149 @@ can still use PNG_DEBUG to control your 
    #ifdef PNG_DEBUG
        fprintf(stderr, ...
    #endif
 
 When PNG_DEBUG = 1, the macros are defined, but only png_debug statements
 having level = 0 will be printed.  There aren't any such statements in
 this version of libpng, but if you insert some they will be printed.
 
+VI.  Runtime optimization
+
+A new feature in libpng 1.2.0 is the ability to dynamically switch between
+standard and optimized versions of some routines.  Currently these are
+limited to three computationally intensive tasks when reading PNG files:
+decoding row filters, expanding interlacing, and combining interlaced or
+transparent row data with previous row data.  Currently the optimized
+versions are available only for x86 (Intel, AMD, etc.) platforms with
+MMX support, though this may change in future versions.  (For example,
+the non-MMX assembler optimizations for zlib might become similarly
+runtime-selectable in future releases, in which case libpng could be
+extended to support them.  Alternatively, the compile-time choice of
+floating-point versus integer routines for gamma correction might become
+runtime-selectable.)
+
+Because such optimizations tend to be very platform- and compiler-dependent,
+both in how they are written and in how they perform, the new runtime code
+in libpng has been written to allow programs to query, enable, and disable
+either specific optimizations or all such optimizations.  For example, to
+enable all possible optimizations (bearing in mind that some "optimizations"
+may actually run more slowly in rare cases):
+
+    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
+       png_uint_32 mask, flags;
+
+       flags = png_get_asm_flags(png_ptr);
+       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
+       png_set_asm_flags(png_ptr, flags | mask);
+    #endif
+
+To enable only optimizations relevant to reading PNGs, use PNG_SELECT_READ
+by itself when calling png_get_asm_flagmask(); similarly for optimizing
+only writing.  To disable all optimizations:
+
+    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
+       flags = png_get_asm_flags(png_ptr);
+       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
+       png_set_asm_flags(png_ptr, flags & ~mask);
+    #endif
+
+To enable or disable only MMX-related features, use png_get_mmx_flagmask()
+in place of png_get_asm_flagmask().  The mmx version takes one additional
+parameter:
+
+    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
+       int selection = PNG_SELECT_READ | PNG_SELECT_WRITE;
+       int compilerID;
+
+       mask = png_get_mmx_flagmask(selection, &compilerID);
+    #endif
+
+On return, compilerID will indicate which version of the MMX assembler
+optimizations was compiled.  Currently two flavors exist:  Microsoft
+Visual C++ (compilerID == 1) and GNU C (a.k.a. gcc/gas, compilerID == 2).
+On non-x86 platforms or on systems compiled without MMX optimizations, a
+value of -1 is used.
+
+Note that both png_get_asm_flagmask() and png_get_mmx_flagmask() return
+all valid, settable optimization bits for the version of the library that's
+currently in use.  In the case of shared (dynamically linked) libraries,
+this may include optimizations that did not exist at the time the code was
+written and compiled.  It is also possible, of course, to enable only known,
+specific optimizations; for example:
+
+    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
+       flags = PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
+             | PNG_ASM_FLAG_MMX_READ_INTERLACE    \
+             | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
+             | PNG_ASM_FLAG_MMX_READ_FILTER_UP    \
+             | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
+             | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
+       png_set_asm_flags(png_ptr, flags);
+    #endif
+
+This method would enable only the MMX read-optimizations available at the
+time of libpng 1.2.0's release, regardless of whether a later version of
+the DLL were actually being used.  (Also note that these functions did not
+exist in versions older than 1.2.0, so any attempt to run a dynamically
+linked app on such an older version would fail.)
+
+To determine whether the processor supports MMX instructions at all, use
+the png_mmx_support() function:
+
+    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
+       mmxsupport = png_mmx_support();
+    #endif
+
+It returns -1 if MMX support is not compiled into libpng, 0 if MMX code
+is compiled but MMX is not supported by the processor, or 1 if MMX support
+is fully available.  Note that png_mmx_support(), png_get_mmx_flagmask(),
+and png_get_asm_flagmask() all may be called without allocating and ini-
+tializing any PNG structures (for example, as part of a usage screen or
+"about" box).
+
+The following code can be used to prevent an application from using the
+thread_unsafe features, even if libpng was built with PNG_THREAD_UNSAFE_OK
+defined:
+
+#if defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED) \
+  && defined(PNG_THREAD_UNSAFE_OK)
+    /* Disable thread-unsafe features of pnggccrd */
+    if (png_access_version_number() >= 10200)
+    {
+      png_uint_32 mmx_disable_mask = 0;
+      png_uint_32 asm_flags;
+
+      mmx_disable_mask |= ( PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
+                          | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
+                          | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
+                          | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH );
+      asm_flags = png_get_asm_flags(png_ptr);
+      png_set_asm_flags(png_ptr, asm_flags & ~mmx_disable_mask);
+    }
+#endif
+
+For more extensive examples of runtime querying, enabling and disabling
+of optimized features, see contrib/gregbook/readpng2.c in the libpng
+source-code distribution.
+
 VII.  MNG support
 
 The MNG specification (available at http://www.libpng.org/pub/mng) allows
 certain extensions to PNG for PNG images that are embedded in MNG datastreams.
 Libpng can support some of these extensions.  To enable them, use the
 png_permit_mng_features() function:
 
    feature_set = png_permit_mng_features(png_ptr, mask)
-   mask is a png_uint_32 containing the bitwise OR of the
+   mask is a png_uint_32 containing the logical OR of the
         features you want to enable.  These include
         PNG_FLAG_MNG_EMPTY_PLTE
         PNG_FLAG_MNG_FILTER_64
         PNG_ALL_MNG_FEATURES
-   feature_set is a png_uint_32 that is the bitwise AND of
+   feature_set is a png_uint_32 that is the logical AND of
       your mask with the set of MNG features that is
       supported by the version of libpng that you are using.
 
 It is an error to use this function when reading or writing a standalone
 PNG file with the PNG 8-byte signature.  The PNG datastream must be wrapped
 in a MNG datastream.  As a minimum, it must have the MNG 8-byte signature
 and the MHDR and MEND chunks.  Libpng does not provide support for these
 or any other MNG chunks; your application must provide its own support for
@@ -2791,23 +2912,23 @@ version with leading zero, and release n
 
 You can also check which version of png.h you used when compiling your
 application:
 
    png_uint_32 application_vn = PNG_LIBPNG_VER;
 
 IX. Y2K Compliance in libpng
 
-October 4, 2007
+June 27, 2006
 
 Since the PNG Development group is an ad-hoc body, we can't make
 an official declaration.
 
 This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.2.21 are Y2K compliant.  It is my belief that earlier
+upward through 1.2.12 are Y2K compliant.  It is my belief that earlier
 versions were also Y2K compliant.
 
 Libpng only has three year fields.  One is a 2-byte unsigned integer that
 will hold years up to 65535.  The other two hold the date in text
 format, and will hold years up to 9999.
 
 The integer is
     "png_uint_16 year" in png_time_struct.
--- a/modules/libimg/png/mozpngconf.h
+++ b/modules/libimg/png/mozpngconf.h
@@ -37,17 +37,19 @@
 #ifndef MOZPNGCONF_H
 #define MOZPNGCONF_H
 
 #define PNG_NO_GLOBAL_ARRAYS
 
 #ifndef MOZ_PNG_READ
 #define PNG_NO_READ_SUPPORTED
 #endif
-#define PNG_NO_WARN_UNINITIALIZED_ROW
+#if defined(XP_MACOSX) && !defined(PNG_NO_MMX_CODE)
+#define PNG_NO_MMX_CODE
+#endif
 #define PNG_NO_READ_BACKGROUND
 #define PNG_NO_READ_DITHER
 #define PNG_NO_READ_INVERT
 #define PNG_NO_READ_SHIFT
 #define PNG_NO_READ_PACK
 #define PNG_NO_READ_PACKSWAP
 #define PNG_NO_READ_FILLER
 #define PNG_NO_READ_SWAP_ALPHA
@@ -64,17 +66,16 @@
 #define PNG_NO_READ_TEXT
 #define PNG_NO_READ_tIME
 #define PNG_NO_READ_UNKNOWN_CHUNKS
 #define PNG_NO_READ_USER_CHUNKS
 #define PNG_NO_READ_EMPTY_PLTE
 #define PNG_NO_READ_OPT_PLTE
 #define PNG_NO_READ_STRIP_ALPHA
 #define PNG_NO_READ_oFFs
-#define PNG_NO_SEQUENTIAL_READ_SUPPORTED
 
 #ifndef MOZ_PNG_WRITE
 #define PNG_NO_WRITE_SUPPORTED
 #else
 #define PNG_NO_WRITE_BACKGROUND
 #define PNG_NO_WRITE_DITHER
 #define PNG_NO_WRITE_INVERT
 #define PNG_NO_WRITE_SHIFT
@@ -98,31 +99,31 @@
 #define PNG_NO_WRITE_sCAL
 #define PNG_NO_WRITE_sPLT
 #define PNG_NO_WRITE_TEXT
 #define PNG_NO_WRITE_tIME
 #define PNG_NO_WRITE_UNKNOWN_CHUNKS
 #define PNG_NO_WRITE_USER_CHUNKS
 #define PNG_NO_WRITE_EMPTY_PLTE
 #define PNG_NO_WRITE_OPT_PLTE
-#define PNG_NO_WRITE_FILTER
 #define PNG_NO_WRITE_WEIGHTED_FILTER
 #define PNG_NO_WRITE_INTERLACING_SUPPORTED  /* effective libpng-1.3.0 */
 #endif
 
 #define PNG_NO_INFO_IMAGE
 #define PNG_NO_USER_MEM
 #define PNG_NO_FIXED_POINT_SUPPORTED
 #define PNG_NO_MNG_FEATURES
 #define PNG_NO_USER_TRANSFORM_PTR
 #define PNG_NO_HANDLE_AS_UNKNOWN
 #define PNG_NO_CONSOLE_IO
 #define PNG_NO_ZALLOC_ZERO
 #define PNG_NO_ERROR_NUMBERS
 #define PNG_NO_EASY_ACCESS
+#define PNG_NO_SEQUENTIAL_READ_SUPPORTED
 
 
 /* Mangle names of exported libpng functions so different libpng versions
    can coexist. It is recommended that if you do this, you give your
    library a different name such as "mozlibpng" instead of "libpng". */
 
 /* The following has been present since libpng-0.88, has never changed, and
    is unaffected by conditional compilation macros.  It will not be mangled
--- a/modules/libimg/png/png.c
+++ b/modules/libimg/png/png.c
@@ -1,37 +1,37 @@
 
 /* png.c - location for general purpose libpng functions
  *
- * Last changed in libpng 1.2.21 [October 4, 2007]
+ * Last changed in libpng 1.2.15 January 5, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
  */
 
 #define PNG_INTERNAL
 #define PNG_NO_EXTERN
 #include "png.h"
 
 /* Generate a compiler error if there is an old png.h in the search path. */
-typedef version_1_2_21 Your_png_h_is_not_version_1_2_21;
+typedef version_1_2_16 Your_png_h_is_not_version_1_2_16;
 
 /* Version information for C files.  This had better match the version
  * string defined in png.h.  */
 
 #ifdef PNG_USE_GLOBAL_ARRAYS
 /* png_libpng_ver was changed to a function in version 1.0.5c */
-PNG_CONST char png_libpng_ver[18] = PNG_LIBPNG_VER_STRING;
+const char png_libpng_ver[18] = PNG_LIBPNG_VER_STRING;
 
 #ifdef PNG_READ_SUPPORTED
 
 /* png_sig was changed to a function in version 1.0.5c */
 /* Place to hold the signature string for a PNG file. */
-PNG_CONST png_byte FARDATA png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+const png_byte FARDATA png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
 #endif /* PNG_READ_SUPPORTED */
 
 /* Invoke global declarations for constant strings for known chunk types */
 PNG_IHDR;
 PNG_IDAT;
 PNG_IEND;
 PNG_PLTE;
 PNG_bKGD;
@@ -54,37 +54,42 @@ PNG_zTXt;
 PNG_acTL;
 PNG_fcTL;
 PNG_fdAT;
 
 #ifdef PNG_READ_SUPPORTED
 /* arrays to facilitate easy interlacing - use pass (0 - 6) as index */
 
 /* start of interlace block */
-PNG_CONST int FARDATA png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
+const int FARDATA png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
 
 /* offset to next interlace block */
-PNG_CONST int FARDATA png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
+const int FARDATA png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
 
 /* start of interlace block in the y direction */
-PNG_CONST int FARDATA png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
+const int FARDATA png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
 
 /* offset to next interlace block in the y direction */
-PNG_CONST int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
+const int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
+
+/* width of interlace block (used in assembler routines only) */
+#ifdef PNG_HAVE_ASSEMBLER_COMBINE_ROW
+const int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
+#endif
 
 /* Height of interlace block.  This is not currently used - if you need
  * it, uncomment it here and in png.h
-PNG_CONST int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
+const int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
 */
 
 /* Mask to determine which pixels are valid in a pass */
-PNG_CONST int FARDATA png_pass_mask[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
+const int FARDATA png_pass_mask[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
 
 /* Mask to determine which pixels to overwrite while displaying */
-PNG_CONST int FARDATA png_pass_dsp_mask[]
+const int FARDATA png_pass_dsp_mask[]
    = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
 
 #endif /* PNG_READ_SUPPORTED */
 #endif /* PNG_USE_GLOBAL_ARRAYS */
 
 /* Tells libpng that we have already handled the first "num_bytes" bytes
  * of the PNG file signature.  If the PNG data is embedded into another
  * stream we can set num_bytes = 8 so that libpng will not attempt to read
@@ -127,17 +132,17 @@ png_sig_cmp(png_bytep sig, png_size_t st
       num_to_check = 8 - start;
 
    return ((int)(png_memcmp(&sig[start], &png_signature[start], num_to_check)));
 }
 
 #if defined(PNG_1_0_X) || defined(PNG_1_2_X)
 /* (Obsolete) function to check signature bytes.  It does not allow one
  * to check a partial signature.  This function might be removed in the
- * future - use png_sig_cmp().  Returns true (nonzero) if the file is PNG.
+ * future - use png_sig_cmp().  Returns true (nonzero) if the file is a PNG.
  */
 int PNGAPI
 png_check_sig(png_bytep sig, int num)
 {
   return ((int)!png_sig_cmp(sig, (png_size_t)0, (png_size_t)num));
 }
 #endif
 #endif /* PNG_READ_SUPPORTED */
@@ -483,21 +488,16 @@ if (mask & PNG_FREE_SPLT)
          info_ptr->splt_palettes_num = 0;
        }
        info_ptr->valid &= ~PNG_INFO_sPLT;
    }
 }
 #endif
 
 #if defined(PNG_UNKNOWN_CHUNKS_SUPPORTED)
-  if(png_ptr->unknown_chunk.data)
-  {
-    png_free(png_ptr, png_ptr->unknown_chunk.data);
-    png_ptr->unknown_chunk.data = NULL;
-  }
 #ifdef PNG_FREE_ME_SUPPORTED
 if ((mask & PNG_FREE_UNKN) & info_ptr->free_me)
 #else
 if (mask & PNG_FREE_UNKN)
 #endif
 {
    if (num != -1)
    {
@@ -667,91 +667,99 @@ png_convert_to_rfc1123(png_structp png_p
           ptime->second % 61);
       WideCharToMultiByte(CP_ACP, 0, time_buf, -1, png_ptr->time_buffer, 29,
           NULL, NULL);
    }
 #else
 #ifdef USE_FAR_KEYWORD
    {
       char near_time_buf[29];
-      png_snprintf6(near_time_buf,29,"%d %s %d %02d:%02d:%02d +0000",
+      sprintf(near_time_buf, "%d %s %d %02d:%02d:%02d +0000",
           ptime->day % 32, short_months[(ptime->month - 1) % 12],
           ptime->year, ptime->hour % 24, ptime->minute % 60,
           ptime->second % 61);
       png_memcpy(png_ptr->time_buffer, near_time_buf,
           29*png_sizeof(char));
    }
 #else
-   png_snprintf6(png_ptr->time_buffer,29,"%d %s %d %02d:%02d:%02d +0000",
+   sprintf(png_ptr->time_buffer, "%d %s %d %02d:%02d:%02d +0000",
        ptime->day % 32, short_months[(ptime->month - 1) % 12],
        ptime->year, ptime->hour % 24, ptime->minute % 60,
        ptime->second % 61);
 #endif
 #endif /* _WIN32_WCE */
    return ((png_charp)png_ptr->time_buffer);
 }
 #endif /* PNG_TIME_RFC1123_SUPPORTED */
 
+#if 0
+/* Signature string for a PNG file. */
+png_bytep PNGAPI
+png_sig_bytes(void)
+{
+   return ((png_bytep)"\211\120\116\107\015\012\032\012");
+}
+#endif
 #endif /* defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED) */
 
 png_charp PNGAPI
 png_get_copyright(png_structp png_ptr)
 {
-   png_ptr = png_ptr;  /* silence compiler warning about unused png_ptr */
-   return ((png_charp) "\n libpng version 1.2.21 - October 4, 2007\n\
+   if (&png_ptr != NULL)  /* silence compiler warning about unused png_ptr */
+   return ((png_charp) "\n libpng version 1.2.16 - January 31, 2007\n\
    Copyright (c) 1998-2007 Glenn Randers-Pehrson\n\
    Copyright (c) 1996-1997 Andreas Dilger\n\
    Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.\n");
+   return ((png_charp) "");
 }
 
 /* The following return the library version as a short string in the
  * format 1.0.0 through 99.99.99zz.  To get the version of *.h files
  * used with your application, print out PNG_LIBPNG_VER_STRING, which
  * is defined in png.h.
  * Note: now there is no difference between png_get_libpng_ver() and
  * png_get_header_ver().  Due to the version_nn_nn_nn typedef guard,
  * it is guaranteed that png.c uses the correct version of png.h.
  */
 png_charp PNGAPI
 png_get_libpng_ver(png_structp png_ptr)
 {
    /* Version of *.c files used when building libpng */
-   png_ptr = png_ptr;  /* silence compiler warning about unused png_ptr */
-   return ((png_charp) PNG_LIBPNG_VER_STRING);
+   if (&png_ptr != NULL)  /* silence compiler warning about unused png_ptr */
+      return ((png_charp) PNG_LIBPNG_VER_STRING);
+   return ((png_charp) "");
 }
 
 png_charp PNGAPI
 png_get_header_ver(png_structp png_ptr)
 {
    /* Version of *.h files used when building libpng */
-   png_ptr = png_ptr;  /* silence compiler warning about unused png_ptr */
-   return ((png_charp) PNG_LIBPNG_VER_STRING);
+   if (&png_ptr != NULL)  /* silence compiler warning about unused png_ptr */
+      return ((png_charp) PNG_LIBPNG_VER_STRING);
+   return ((png_charp) "");
 }
 
 png_charp PNGAPI
 png_get_header_version(png_structp png_ptr)
 {
    /* Returns longer string containing both version and date */
-   png_ptr = png_ptr;  /* silence compiler warning about unused png_ptr */
-   return ((png_charp) PNG_HEADER_VERSION_STRING
-#ifndef PNG_READ_SUPPORTED
-   "     (NO READ SUPPORT)"
-#endif
-   "\n");
+   if (&png_ptr != NULL)  /* silence compiler warning about unused png_ptr */
+      return ((png_charp) PNG_HEADER_VERSION_STRING);
+   return ((png_charp) "");
 }
 
 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
 #ifdef PNG_HANDLE_AS_UNKNOWN_SUPPORTED
 int PNGAPI
 png_handle_as_unknown(png_structp png_ptr, png_bytep chunk_name)
 {
    /* check chunk_name and return "keep" value if it's on the list, else 0 */
    int i;
    png_bytep p;
-   if(png_ptr == NULL || chunk_name == NULL || png_ptr->num_chunk_list<=0)
+   if((png_ptr == NULL && chunk_name == NULL) || png_ptr->num_chunk_list<=0)
       return 0;
    p=png_ptr->chunk_list+png_ptr->num_chunk_list*5-5;
    for (i = png_ptr->num_chunk_list; i; i--, p-=5)
       if (!png_memcmp(chunk_name, p, 4))
         return ((int)*(p+4));
    return 0;
 }
 #endif
@@ -771,25 +779,75 @@ png_access_version_number(void)
 {
    /* Version of *.c files used when building libpng */
    return((png_uint_32) PNG_LIBPNG_VER);
 }
 
 
 #if defined(PNG_READ_SUPPORTED) && defined(PNG_ASSEMBLER_CODE_SUPPORTED)
 #if !defined(PNG_1_0_X)
+#if defined(PNG_MMX_CODE_SUPPORTED)
+/* this INTERNAL function was added to libpng 1.2.0 */
+void /* PRIVATE */
+png_init_mmx_flags (png_structp png_ptr)
+{
+    if(png_ptr == NULL) return;
+    png_ptr->mmx_rowbytes_threshold = 0;
+    png_ptr->mmx_bitdepth_threshold = 0;
+
+#  if (defined(PNG_USE_PNGVCRD) || defined(PNG_USE_PNGGCCRD))
+
+    png_ptr->asm_flags |= PNG_ASM_FLAG_MMX_SUPPORT_COMPILED;
+
+    if (png_mmx_support() > 0) {
+        png_ptr->asm_flags |= PNG_ASM_FLAG_MMX_SUPPORT_IN_CPU
+#    ifdef PNG_HAVE_ASSEMBLER_COMBINE_ROW
+                              | PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
+#    endif
+#    ifdef PNG_HAVE_ASSEMBLER_READ_INTERLACE
+                              | PNG_ASM_FLAG_MMX_READ_INTERLACE
+#    endif
+#    ifndef PNG_HAVE_ASSEMBLER_READ_FILTER_ROW
+                              ;
+#    else
+                              | PNG_ASM_FLAG_MMX_READ_FILTER_SUB
+                              | PNG_ASM_FLAG_MMX_READ_FILTER_UP
+                              | PNG_ASM_FLAG_MMX_READ_FILTER_AVG
+                              | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
+
+        png_ptr->mmx_rowbytes_threshold = PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT;
+        png_ptr->mmx_bitdepth_threshold = PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT;
+#    endif
+    } else {
+        png_ptr->asm_flags &= ~( PNG_ASM_FLAG_MMX_SUPPORT_IN_CPU
+                               | PNG_MMX_READ_FLAGS
+                               | PNG_MMX_WRITE_FLAGS );
+    }
+
+#  else /* !(PNGVCRD || PNGGCCRD) */
+
+    /* clear all MMX flags; no support is compiled in */
+    png_ptr->asm_flags &= ~( PNG_MMX_FLAGS );
+
+#  endif /* ?(PNGVCRD || PNGGCCRD) */
+}
+
+#endif /* !(PNG_MMX_CODE_SUPPORTED) */
+
 /* this function was added to libpng 1.2.0 */
+#if !defined(PNG_USE_PNGGCCRD) && \
+    !(defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD))
 int PNGAPI
 png_mmx_support(void)
 {
-   /* obsolete, to be removed from libpng-1.4.0 */
     return -1;
 }
-#endif /* PNG_1_0_X */
-#endif /* PNG_READ_SUPPORTED && PNG_ASSEMBLER_CODE_SUPPORTED */
+#endif
+#endif /* PNG_1_0_X  && PNG_ASSEMBLER_CODE_SUPPORTED */
+#endif /* PNG_READ_SUPPORTED */
 
 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
 #ifdef PNG_SIZE_T
 /* Added at libpng version 1.2.6 */
    PNG_EXTERN png_size_t PNGAPI png_convert_size PNGARG((size_t size));
 png_size_t PNGAPI
 png_convert_size(size_t size)
 {
--- a/modules/libimg/png/png.h
+++ b/modules/libimg/png/png.h
@@ -1,20 +1,20 @@
 
 /* png.h - header file for PNG reference library
  *
- * libpng version 1.2.21 - October 4, 2007
+ * libpng version 1.2.16 - January 31, 2007
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
  *
  * Authors and maintainers:
  *  libpng versions 0.71, May 1995, through 0.88, January 1996: Guy Schalnat
  *  libpng versions 0.89c, June 1996, through 0.96, May 1997: Andreas Dilger
- *  libpng versions 0.97, January 1998, through 1.2.21 - October 4, 2007: Glenn
+ *  libpng versions 0.97, January 1998, through 1.2.16 - January 31, 2007: Glenn
  *  See also "Contributing Authors", below.
  *
  * Note about libpng version numbers:
  *
  *    Due to various miscommunications, unforeseen code incompatibilities
  *    and occasional factors outside the authors' control, version numbering
  *    on the library has not always been consistent and straightforward.
  *    The following table summarizes matters since version 0.89c, which was
@@ -138,37 +138,16 @@
  *    1.0.23rc1-5             10    10023  10.so.0.23[.0]
  *    1.2.15rc1-5             13    10215  12.so.0.15[.0]
  *    1.0.23                  10    10023  10.so.0.23[.0]
  *    1.2.15                  13    10215  12.so.0.15[.0]
  *    1.2.16beta1-2           13    10216  12.so.0.16[.0]
  *    1.2.16rc1               13    10216  12.so.0.16[.0]
  *    1.0.24                  10    10024  10.so.0.24[.0]
  *    1.2.16                  13    10216  12.so.0.16[.0]
- *    1.2.17beta1-2           13    10217  12.so.0.17[.0]
- *    1.0.25rc1               10    10025  10.so.0.25[.0]
- *    1.2.17rc1-3             13    10217  12.so.0.17[.0]
- *    1.0.25                  10    10025  10.so.0.25[.0]
- *    1.2.17                  13    10217  12.so.0.17[.0]
- *    1.0.26                  10    10026  10.so.0.26[.0]
- *    1.2.18                  13    10218  12.so.0.18[.0]
- *    1.2.19beta1-31          13    10219  12.so.0.19[.0]
- *    1.0.27rc1-6             10    10027  10.so.0.27[.0]
- *    1.2.19rc1-6             13    10219  12.so.0.19[.0]
- *    1.0.27                  10    10027  10.so.0.27[.0]
- *    1.2.19                  13    10219  12.so.0.19[.0]
- *    1.2.20beta01-04         13    10220  12.so.0.20[.0]
- *    1.0.28rc1-6             10    10028  10.so.0.28[.0]
- *    1.2.20rc1-6             13    10220  12.so.0.20[.0]
- *    1.0.28                  10    10028  10.so.0.28[.0]
- *    1.2.20                  13    10220  12.so.0.20[.0]
- *    1.2.21beta1-2           13    10221  12.so.0.21[.0]
- *    1.2.21rc1-3             13    10221  12.so.0.21[.0]
- *    1.0.29                  10    10029  10.so.0.29[.0]
- *    1.2.21                  13    10221  12.so.0.21[.0]
  *
  *    Henceforth the source version will match the shared-library major
  *    and minor numbers; the shared-library major version number will be
  *    used for changes in backward compatibility, as it is intended.  The
  *    PNG_LIBPNG_VER macro, which is not used within libpng but is available
  *    for applications, is an unsigned integer of the form xyyzz corresponding
  *    to the source version x.y.z (leading zeros in y and z).  Beta versions
  *    were given the previous public release number plus a letter, until
@@ -188,18 +167,18 @@
  */
 
 /*
  * COPYRIGHT NOTICE, DISCLAIMER, and LICENSE:
  *
  * If you modify libpng you may insert additional notices immediately following
  * this sentence.
  *
- * libpng versions 1.2.6, August 15, 2004, through 1.2.21, October 4, 2007, are
- * Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson, and are
+ * libpng versions 1.2.6, August 15, 2004, through 1.2.16, January 31, 2007, are
+ * Copyright (c) 2004, 2007 Glenn Randers-Pehrson, and are
  * distributed according to the same disclaimer and license as libpng-1.2.5
  * with the following individual added to the list of Contributing Authors:
  *
  *    Cosmin Truta
  *
  * libpng versions 1.0.7, July 1, 2000, through 1.2.5, October 3, 2002, are
  * Copyright (c) 2000-2002 Glenn Randers-Pehrson, and are
  * distributed according to the same disclaimer and license as libpng-1.0.6
@@ -300,23 +279,23 @@
  *
  * Thanks to Frank J. T. Wojcik for helping with the documentation.
  */
 
 /*
  * Y2K compliance in libpng:
  * =========================
  *
- *    October 4, 2007
+ *    January 31, 2007
  *
  *    Since the PNG Development group is an ad-hoc body, we can't make
  *    an official declaration.
  *
  *    This is your unofficial assurance that libpng from version 0.71 and
- *    upward through 1.2.21 are Y2K compliant.  It is my belief that earlier
+ *    upward through 1.2.16 are Y2K compliant.  It is my belief that earlier
  *    versions were also Y2K compliant.
  *
  *    Libpng only has three year fields.  One is a 2-byte unsigned integer
  *    that will hold years up to 65535.  The other two hold the date in text
  *    format, and will hold years up to 9999.
  *
  *    The integer is
  *        "png_uint_16 year" in png_time_struct.
@@ -362,27 +341,27 @@
 
 /* This is not the place to learn how to use libpng.  The file libpng.txt
  * describes how to use libpng, and the file example.c summarizes it
  * with some code on which to build.  This file is useful for looking
  * at the actual function definitions and structure components.
  */
 
 /* Version information for png.h - this should match the version in png.c */
-#define PNG_LIBPNG_VER_STRING "1.2.21"
+#define PNG_LIBPNG_VER_STRING "1.2.16"
 #define PNG_HEADER_VERSION_STRING \
-   " libpng version 1.2.21 - October 4, 2007\n"
+   " libpng version 1.2.16 - January 31, 2007 (header)\n"
 
 #define PNG_LIBPNG_VER_SONUM   0
 #define PNG_LIBPNG_VER_DLLNUM  13
 
 /* These should match the first 3 components of PNG_LIBPNG_VER_STRING: */
 #define PNG_LIBPNG_VER_MAJOR   1
 #define PNG_LIBPNG_VER_MINOR   2
-#define PNG_LIBPNG_VER_RELEASE 21
+#define PNG_LIBPNG_VER_RELEASE 16
 /* This should match the numeric part of the final component of
  * PNG_LIBPNG_VER_STRING, omitting any leading zero: */
 
 #define PNG_LIBPNG_VER_BUILD  0
 
 /* Release Status */
 #define PNG_LIBPNG_BUILD_ALPHA    1
 #define PNG_LIBPNG_BUILD_BETA     2
@@ -400,17 +379,17 @@
 
 #define PNG_LIBPNG_BUILD_BASE_TYPE PNG_LIBPNG_BUILD_STABLE
 
 /* Careful here.  At one time, Guy wanted to use 082, but that would be octal.
  * We must not include leading zeros.
  * Versions 0.7 through 1.0.0 were in the range 0 to 100 here (only
  * version 1.0.0 was mis-numbered 100 instead of 10000).  From
  * version 1.0.1 it's    xxyyzz, where x=major, y=minor, z=release */
-#define PNG_LIBPNG_VER 10221 /* 1.2.21 */
+#define PNG_LIBPNG_VER 10216 /* 1.2.16 */
 
 #ifndef PNG_VERSION_INFO_ONLY
 /* include the compression library's header */
 #include "zlib.h"
 #endif
 
 /* include all user configurable info, including optional assembler routines */
 #include "pngconf.h"
@@ -488,33 +467,36 @@ extern "C" {
 #endif
 
 /* variables declared in png.c - only it needs to define PNG_NO_EXTERN */
 #if !defined(PNG_NO_EXTERN) || defined(PNG_ALWAYS_EXTERN)
 /* Version information for C files, stored in png.c.  This had better match
  * the version above.
  */
 #ifdef PNG_USE_GLOBAL_ARRAYS
-PNG_EXPORT_VAR (PNG_CONST char) png_libpng_ver[18];
+PNG_EXPORT_VAR (const char) png_libpng_ver[18];
   /* need room for 99.99.99beta99z */
 #else
 #define png_libpng_ver png_get_header_ver(NULL)
 #endif
 
 #ifdef PNG_USE_GLOBAL_ARRAYS
 /* This was removed in version 1.0.5c */
 /* Structures to facilitate easy interlacing.  See png.c for more details */
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_start[7];
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_inc[7];
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_ystart[7];
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_yinc[7];
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_mask[7];
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_dsp_mask[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_start[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_inc[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_ystart[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_yinc[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_mask[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_dsp_mask[7];
+#ifdef PNG_USE_PNGGCCRD
+PNG_EXPORT_VAR (const int FARDATA) png_pass_width[7];
+#endif
 /* This isn't currently used.  If you need it, see png.c for more details.
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_height[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_height[7];
 */
 #endif
 
 #endif /* PNG_NO_EXTERN */
 
 /* Three color definitions.  The order of the red, green, and blue, (and the
  * exact size) is not important, although the size of the fields need to
  * be png_byte or png_uint_16 (as defined below).
@@ -1294,17 +1276,18 @@ struct png_struct_def
    int process_mode;                 /* what push library is currently doing */
    int cur_palette;                  /* current push library palette index */
 
 #  if defined(PNG_TEXT_SUPPORTED)
      png_size_t current_text_size;   /* current size of text input data */
      png_size_t current_text_left;   /* how much text left to read in input */
      png_charp current_text;         /* current text chunk buffer */
      png_charp current_text_ptr;     /* current location in current_text */
-#  endif /* PNG_TEXT_SUPPORTED */
+#  endif /* PNG_PROGRESSIVE_READ_SUPPORTED && PNG_TEXT_SUPPORTED */
+
 #endif /* PNG_PROGRESSIVE_READ_SUPPORTED */
 
 #if defined(__TURBOC__) && !defined(_Windows) && !defined(__FLAT__)
 /* for the Borland special 64K segment handler */
    png_bytepp offset_table_ptr;
    png_bytep offset_table;
    png_uint_16 offset_table_number;
    png_uint_16 offset_table_count;
@@ -1376,17 +1359,17 @@ struct png_struct_def
    png_fixed_point int_gamma;
 #endif
 
 /* New member added in libpng-1.0.9, ifdef'ed out in 1.0.12, enabled in 1.2.0 */
 #if defined(PNG_MNG_FEATURES_SUPPORTED)
    png_byte filter_type;
 #endif
 
-#if defined(PNG_1_0_X)
+#if defined(PNG_1_0_X) || (defined(PNG_DEBUG) && defined(PNG_USE_PNGGCCRD))
 /* New member added in libpng-1.0.10, ifdef'ed out in 1.2.0 */
    png_uint_32 row_buf_size;
 #endif
 
 /* New members added in libpng-1.2.0 */
 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
 #  if !defined(PNG_1_0_X)
 #    if defined(PNG_MMX_CODE_SUPPORTED)
@@ -1424,56 +1407,50 @@ struct png_struct_def
    png_uint_32 user_height_max;
 #endif
 
 #if defined(PNG_APNG_SUPPORTED)
    png_uint_32 apng_flags;
    png_uint_32 next_seq_num;         /* next fcTL/fdAT chunk sequence number */
    png_uint_32 first_frame_width;
    png_uint_32 first_frame_height;
+#endif
 
 #if defined(PNG_READ_APNG_SUPPORTED)
    png_uint_32 num_frames_read;      /* incremented after all image data of */
                                      /* a frame is read */
 #ifdef PNG_PROGRESSIVE_READ_SUPPORTED
    png_progressive_frame_ptr frame_info_fn; /* frame info read callback */
    png_progressive_frame_ptr frame_end_fn;  /* frame data read callback */
 #endif
 #endif
 
 #if defined(PNG_WRITE_APNG_SUPPORTED)
    png_uint_32 num_frames_to_write;
    png_uint_32 num_frames_written;
 #endif
-#endif
+
+};
 
 /* For png_struct.apng_flags: */
 #define PNG_FIRST_FRAME_HIDDEN       0x0001
 
 /* dispose_op flags from inside fcTL */
 #define PNG_DISPOSE_OP_NONE        0x00
 #define PNG_DISPOSE_OP_BACKGROUND  0x01
 #define PNG_DISPOSE_OP_PREVIOUS    0x02
 
 /* blend_op flags from inside fcTL */
 #define PNG_BLEND_OP_SOURCE        0x00
 #define PNG_BLEND_OP_OVER          0x01
 
-/* New member added in libpng-1.0.25 and 1.2.17 */
-#if defined(PNG_UNKNOWN_CHUNKS_SUPPORTED)
-   /* storage for unknown chunk that the library doesn't recognize. */
-   png_unknown_chunk unknown_chunk;
-#endif
-};
-
-
 /* This triggers a compiler error in png.c, if png.c and png.h
  * do not agree upon the version number.
  */
-typedef png_structp version_1_2_21;
+typedef png_structp version_1_2_16;
 
 typedef png_struct FAR * FAR * png_structpp;
 
 /* Here are the function definitions most commonly used.  This is not
  * the place to find out how to use libpng.  See libpng.txt for the
  * full explanation, see example.c for the summary.  This just provides
  * a simple one line description of the use of each function.
  */
@@ -2128,40 +2105,31 @@ extern PNG_EXPORT(png_voidp,png_memcpy_c
 extern PNG_EXPORT(png_voidp,png_memset_check) PNGARG((png_structp png_ptr,
    png_voidp s1, int value, png_uint_32 size));
 
 #if defined(USE_FAR_KEYWORD)  /* memory model conversion function */
 extern void *png_far_to_near PNGARG((png_structp png_ptr,png_voidp ptr,
    int check));
 #endif /* USE_FAR_KEYWORD */
 
-#ifndef PNG_NO_ERROR_TEXT
 /* Fatal error in PNG image of libpng - can't continue */
 extern PNG_EXPORT(void,png_error) PNGARG((png_structp png_ptr,
    png_const_charp error_message));
 
 /* The same, but the chunk name is prepended to the error string. */
 extern PNG_EXPORT(void,png_chunk_error) PNGARG((png_structp png_ptr,
    png_const_charp error_message));
-#else
-/* Fatal error in PNG image of libpng - can't continue */
-extern PNG_EXPORT(void,png_err) PNGARG((png_structp png_ptr));
-#endif
-
-#ifndef PNG_NO_WARNINGS
+
 /* Non-fatal error in libpng.  Can continue, but may have a problem. */
 extern PNG_EXPORT(void,png_warning) PNGARG((png_structp png_ptr,
    png_const_charp warning_message));
 
-#ifdef PNG_READ_SUPPORTED
 /* Non-fatal error in libpng, chunk name is prepended to message. */
 extern PNG_EXPORT(void,png_chunk_warning) PNGARG((png_structp png_ptr,
    png_const_charp warning_message));
-#endif /* PNG_READ_SUPPORTED */
-#endif /* PNG_NO_WARNINGS */
 
 /* The png_set_<chunk> functions are for storing values in the png_info_struct.
  * Similarly, the png_get_<chunk> calls are used to read values from the
  * png_info_struct, either storing the parameters in the passed variables, or
  * setting pointers into the png_info_struct where the data is stored.  The
  * png_get_<chunk> functions return a non-zero value if the data was available
  * in info_ptr, or return zero and do not change any of the parameters if the
  * data was not available.
@@ -2538,17 +2506,17 @@ extern PNG_EXPORT(void,png_read_frame_he
    png_infop info_ptr));
 #endif
 
 #if defined(PNG_UNKNOWN_CHUNKS_SUPPORTED)
 /* provide a list of chunks and how they are to be handled, if the built-in
    handling or default unknown chunk handling is not desired.  Any chunks not
    listed will be handled in the default manner.  The IHDR and IEND chunks
    must not be listed.
-      keep = 0: follow default behaviour
+      keep = 0: follow default behavour
            = 1: do not keep
            = 2: keep only if safe-to-copy
            = 3: keep even if unsafe-to-copy
 */
 extern PNG_EXPORT(void, png_set_keep_unknown_chunks) PNGARG((png_structp
    png_ptr, int keep, png_bytep chunk_list, int num_chunks));
 extern PNG_EXPORT(void, png_set_unknown_chunks) PNGARG((png_structp png_ptr,
    png_infop info_ptr, png_unknown_chunkp unknowns, int num_unknowns));
@@ -2626,16 +2594,20 @@ extern PNG_EXPORT(void, png_write_png) P
 #endif
 #ifndef png_debug1
 #define png_debug1(l, m, p1)
 #endif
 #ifndef png_debug2
 #define png_debug2(l, m, p1, p2)
 #endif
 
+#if 0
+extern PNG_EXPORT(png_bytep,png_sig_bytes) PNGARG((void));
+#endif
+
 extern PNG_EXPORT(png_charp,png_get_copyright) PNGARG((png_structp png_ptr));
 extern PNG_EXPORT(png_charp,png_get_header_ver) PNGARG((png_structp png_ptr));
 extern PNG_EXPORT(png_charp,png_get_header_version) PNGARG((png_structp png_ptr));
 extern PNG_EXPORT(png_charp,png_get_libpng_ver) PNGARG((png_structp png_ptr));
 
 #ifdef PNG_MNG_FEATURES_SUPPORTED
 extern PNG_EXPORT(png_uint_32,png_permit_mng_features) PNGARG((png_structp
    png_ptr, png_uint_32 mng_features_permitted));
@@ -2815,31 +2787,26 @@ extern PNG_EXPORT(void,png_save_uint_16)
 /* ************************************************************************* */
 
 /* These next functions are used internally in the code.  They generally
  * shouldn't be used unless you are writing code to add or replace some
  * functionality in libpng.  More information about most functions can
  * be found in the files where the functions are located.
  */
 
-
-/* Various modes of operation, that are visible to applications because
- * they are used for unknown chunk location.
+#if defined(PNG_INTERNAL)
+
+/* Various modes of operation.  Note that after an init, mode is set to
+ * zero automatically when the structure is created.
  */
 #define PNG_HAVE_IHDR               0x01
 #define PNG_HAVE_PLTE               0x02
 #define PNG_HAVE_IDAT               0x04
 #define PNG_AFTER_IDAT              0x08 /* Have complete zlib datastream */
 #define PNG_HAVE_IEND               0x10
-
-#if defined(PNG_INTERNAL)
-
-/* More modes of operation.  Note that after an init, mode is set to
- * zero automatically when the structure is created.
- */
 #define PNG_HAVE_gAMA               0x20
 #define PNG_HAVE_cHRM               0x40
 #define PNG_HAVE_sRGB               0x80
 #define PNG_HAVE_CHUNK_HEADER      0x100
 #define PNG_WROTE_tIME             0x200
 #define PNG_WROTE_INFO_BEFORE_PLTE 0x400
 #define PNG_BACKGROUND_IS_GRAY     0x800
 #define PNG_HAVE_PNG_SIGNATURE    0x1000
@@ -2951,75 +2918,78 @@ extern PNG_EXPORT(void,png_save_uint_16)
    integers, "value" a variable. Added to libpng-1.2.6 JB */
 #define PNG_OUT_OF_RANGE(value, ideal, delta) \
         ( (value) < (ideal)-(delta) || (value) > (ideal)+(delta) )
 
 /* variables declared in png.c - only it needs to define PNG_NO_EXTERN */
 #if !defined(PNG_NO_EXTERN) || defined(PNG_ALWAYS_EXTERN)
 /* place to hold the signature string for a PNG file. */
 #ifdef PNG_USE_GLOBAL_ARRAYS
-   PNG_EXPORT_VAR (PNG_CONST png_byte FARDATA) png_sig[8];
+   PNG_EXPORT_VAR (const png_byte FARDATA) png_sig[8];
 #else
+#if 0
+#define png_sig png_sig_bytes(NULL)
+#endif
 #endif
 #endif /* PNG_NO_EXTERN */
 
 /* Constant strings for known chunk types.  If you need to add a chunk,
  * define the name here, and add an invocation of the macro in png.c and
  * wherever it's needed.
  */
-#define PNG_IHDR png_byte png_IHDR[5] = { 73,  72,  68,  82, '\0'}
-#define PNG_IDAT png_byte png_IDAT[5] = { 73,  68,  65,  84, '\0'}
-#define PNG_IEND png_byte png_IEND[5] = { 73,  69,  78,  68, '\0'}
-#define PNG_PLTE png_byte png_PLTE[5] = { 80,  76,  84,  69, '\0'}
-#define PNG_bKGD png_byte png_bKGD[5] = { 98,  75,  71,  68, '\0'}
-#define PNG_cHRM png_byte png_cHRM[5] = { 99,  72,  82,  77, '\0'}
-#define PNG_gAMA png_byte png_gAMA[5] = {103,  65,  77,  65, '\0'}
-#define PNG_hIST png_byte png_hIST[5] = {104,  73,  83,  84, '\0'}
-#define PNG_iCCP png_byte png_iCCP[5] = {105,  67,  67,  80, '\0'}
-#define PNG_iTXt png_byte png_iTXt[5] = {105,  84,  88, 116, '\0'}
-#define PNG_oFFs png_byte png_oFFs[5] = {111,  70,  70, 115, '\0'}
-#define PNG_pCAL png_byte png_pCAL[5] = {112,  67,  65,  76, '\0'}
-#define PNG_sCAL png_byte png_sCAL[5] = {115,  67,  65,  76, '\0'}
-#define PNG_pHYs png_byte png_pHYs[5] = {112,  72,  89, 115, '\0'}
-#define PNG_sBIT png_byte png_sBIT[5] = {115,  66,  73,  84, '\0'}
-#define PNG_sPLT png_byte png_sPLT[5] = {115,  80,  76,  84, '\0'}
-#define PNG_sRGB png_byte png_sRGB[5] = {115,  82,  71,  66, '\0'}
-#define PNG_tEXt png_byte png_tEXt[5] = {116,  69,  88, 116, '\0'}
-#define PNG_tIME png_byte png_tIME[5] = {116,  73,  77,  69, '\0'}
-#define PNG_tRNS png_byte png_tRNS[5] = {116,  82,  78,  83, '\0'}
-#define PNG_zTXt png_byte png_zTXt[5] = {122,  84,  88, 116, '\0'}
-#define PNG_acTL png_byte png_acTL[5] = { 97,  99,  84,  76, '\0'}
-#define PNG_fcTL png_byte png_fcTL[5] = {102,  99,  84,  76, '\0'}
-#define PNG_fdAT png_byte png_fdAT[5] = {102, 100,  65,  84, '\0'}
+#define PNG_IHDR const png_byte png_IHDR[5] = { 73,  72,  68,  82, '\0'}
+#define PNG_IDAT const png_byte png_IDAT[5] = { 73,  68,  65,  84, '\0'}
+#define PNG_IEND const png_byte png_IEND[5] = { 73,  69,  78,  68, '\0'}
+#define PNG_PLTE const png_byte png_PLTE[5] = { 80,  76,  84,  69, '\0'}
+#define PNG_bKGD const png_byte png_bKGD[5] = { 98,  75,  71,  68, '\0'}
+#define PNG_cHRM const png_byte png_cHRM[5] = { 99,  72,  82,  77, '\0'}
+#define PNG_gAMA const png_byte png_gAMA[5] = {103,  65,  77,  65, '\0'}
+#define PNG_hIST const png_byte png_hIST[5] = {104,  73,  83,  84, '\0'}
+#define PNG_iCCP const png_byte png_iCCP[5] = {105,  67,  67,  80, '\0'}
+#define PNG_iTXt const png_byte png_iTXt[5] = {105,  84,  88, 116, '\0'}
+#define PNG_oFFs const png_byte png_oFFs[5] = {111,  70,  70, 115, '\0'}
+#define PNG_pCAL const png_byte png_pCAL[5] = {112,  67,  65,  76, '\0'}
+#define PNG_sCAL const png_byte png_sCAL[5] = {115,  67,  65,  76, '\0'}
+#define PNG_pHYs const png_byte png_pHYs[5] = {112,  72,  89, 115, '\0'}
+#define PNG_sBIT const png_byte png_sBIT[5] = {115,  66,  73,  84, '\0'}
+#define PNG_sPLT const png_byte png_sPLT[5] = {115,  80,  76,  84, '\0'}
+#define PNG_sRGB const png_byte png_sRGB[5] = {115,  82,  71,  66, '\0'}
+#define PNG_tEXt const png_byte png_tEXt[5] = {116,  69,  88, 116, '\0'}
+#define PNG_tIME const png_byte png_tIME[5] = {116,  73,  77,  69, '\0'}
+#define PNG_tRNS const png_byte png_tRNS[5] = {116,  82,  78,  83, '\0'}
+#define PNG_zTXt const png_byte png_zTXt[5] = {122,  84,  88, 116, '\0'}
+#define PNG_acTL const png_byte png_acTL[5] = { 97,  99,  84,  76, '\0'}
+#define PNG_fcTL const png_byte png_fcTL[5] = {102,  99,  84,  76, '\0'}
+#define PNG_fdAT const png_byte png_fdAT[5] = {102, 100,  65,  84, '\0'}
 
 #ifdef PNG_USE_GLOBAL_ARRAYS
-PNG_EXPORT_VAR (png_byte FARDATA) png_IHDR[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_IDAT[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_IEND[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_PLTE[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_bKGD[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_cHRM[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_gAMA[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_hIST[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_iCCP[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_iTXt[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_oFFs[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_pCAL[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_sCAL[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_pHYs[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_sBIT[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_sPLT[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_sRGB[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_tEXt[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_tIME[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_tRNS[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_zTXt[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_acTL[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_fcTL[5];
-PNG_EXPORT_VAR (png_byte FARDATA) png_fdAT[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_IHDR[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_IDAT[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_IEND[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_PLTE[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_bKGD[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_cHRM[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_gAMA[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_hIST[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_iCCP[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_iTXt[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_oFFs[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_pCAL[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_sCAL[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_pHYs[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_sBIT[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_sPLT[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_sRGB[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_tEXt[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_tIME[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_tRNS[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_zTXt[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_acTL[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_fcTL[5];
+PNG_EXPORT_VAR (const png_byte FARDATA) png_fdAT[5];
 #endif /* PNG_USE_GLOBAL_ARRAYS */
 
 #if defined(PNG_1_0_X) || defined (PNG_1_2_X)
 /* Initialize png_ptr struct for reading, and allocate any other memory.
  * (old interface - DEPRECATED - use png_create_read_struct instead).
  */
 extern PNG_EXPORT(void,png_read_init) PNGARG((png_structp png_ptr));
 #undef png_read_init
--- a/modules/libimg/png/pngconf.h
+++ b/modules/libimg/png/pngconf.h
@@ -1,12 +1,12 @@
 
 /* pngconf.h - machine configurable file for libpng
  *
- * libpng version 1.2.21 - October 4, 2007
+ * libpng version 1.2.16 - January 31, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
  */
 
 /* Any machine specific code is near the front of this file, so if you
  * are configuring libpng for a machine, you may want to read the section
@@ -72,21 +72,16 @@
 # define PNG_USER_PRIVATEBUILD PRIVATEBUILD
 #endif
 #endif /* __STDC__ */
 
 #ifndef PNG_VERSION_INFO_ONLY
 
 /* End of material added to libpng-1.2.8 */
 
-/* Added at libpng-1.2.19, removed at libpng-1.2.20 because it caused trouble
-   Restored at libpng-1.2.21 */
-/* #    define PNG_WARN_UNINITIALIZED_ROW 1 */
-/* End of material added at libpng-1.2.19/1.2.21 */
-
 /* This is the size of the compression buffer, and thus the size of
  * an IDAT chunk.  Make this whatever size you feel is best for your
  * machine.  One of these will be allocated per png_struct.  When this
  * is full, it writes the data to the disk, and does some other
  * calculations.  Making this an extremely small size will slow
  * the library down, but you may want to experiment to determine
  * where it becomes significant, if you are concerned with memory
  * usage.  Note that zlib allocates at least 32Kb also.  For readers,
@@ -278,17 +273,16 @@
 #  define PNGARG(arglist) ()
 #  ifndef PNG_TYPECAST_NULL
 #     define PNG_TYPECAST_NULL
 #  endif
 #else
 #  define PNGARG(arglist) arglist
 #endif /* _NO_PROTO */
 
-
 #endif /* OF */
 
 #endif /* PNGARG */
 
 /* Try to determine if we are compiling on a Mac.  Note that testing for
  * just __MWERKS__ is not good enough, because the Codewarrior is now used
  * on non-Mac platforms.
  */
@@ -597,26 +591,26 @@
 #    define PNG_READ_USER_TRANSFORM_SUPPORTED
 #  endif
 #  ifndef PNG_NO_READ_RGB_TO_GRAY
 #    define PNG_READ_RGB_TO_GRAY_SUPPORTED
 #  endif
 #endif /* PNG_READ_TRANSFORMS_SUPPORTED */
 
 #if !defined(PNG_NO_PROGRESSIVE_READ) && \
- !defined(PNG_PROGRESSIVE_READ_SUPPORTED) /* if you don't do progressive   */
-#  define PNG_PROGRESSIVE_READ_SUPPORTED  /* reading.  This is not talking */
-#endif                            /* about interlacing capability!  You'll */
-           /* still have interlacing unless you change the following line: */
+ !defined(PNG_PROGRESSIVE_READ_NOT_SUPPORTED)  /* if you don't do progressive */
+#  define PNG_PROGRESSIVE_READ_SUPPORTED     /* reading.  This is not talking */
+#endif                               /* about interlacing capability!  You'll */
+              /* still have interlacing unless you change the following line: */
 
-#define PNG_READ_INTERLACING_SUPPORTED /* required in PNG-compliant decoders */
+#define PNG_READ_INTERLACING_SUPPORTED /* required for PNG-compliant decoders */
 
 #ifndef PNG_NO_READ_COMPOSITE_NODIV
 #  ifndef PNG_NO_READ_COMPOSITED_NODIV  /* libpng-1.0.x misspelling */
-#    define PNG_READ_COMPOSITE_NODIV_SUPPORTED  /* well tested on Intel, SGI */
+#    define PNG_READ_COMPOSITE_NODIV_SUPPORTED   /* well tested on Intel, SGI */
 #  endif
 #endif
 
 #if defined(PNG_1_0_X) || defined (PNG_1_2_X)
 /* Deprecated, will be removed from version 2.0.0.
    Use PNG_MNG_FEATURES_SUPPORTED instead. */
 #ifndef PNG_NO_READ_EMPTY_PLTE
 #  define PNG_READ_EMPTY_PLTE_SUPPORTED
@@ -724,56 +718,47 @@
  * png_get_x_offset_microns()
  * png_get_y_offset_microns()
  */
 #if !defined(PNG_NO_EASY_ACCESS) && !defined(PNG_EASY_ACCESS_SUPPORTED)
 #  define PNG_EASY_ACCESS_SUPPORTED
 #endif
 
 /* PNG_ASSEMBLER_CODE was enabled by default in version 1.2.0 
- * and removed from version 1.2.20.  The following will be removed
- * from libpng-1.4.0
+ * even when PNG_USE_PNGVCRD or PNG_USE_PNGGCCRD is not defined.
+ *
+ * PNG_NO_ASSEMBLER_CODE disables use of all assembler code and optimized C,
+ * and removes or includes several functions in the API.
+ *
+ * PNG_NO_MMX_CODE disables the use of MMX code without changing the API.
+ * When MMX code is off, then optimized C replacement functions are used.
 */
-
-#if defined(PNG_READ_SUPPORTED) && !defined(PNG_NO_OPTIMIZED_CODE)
-#  ifndef PNG_OPTIMIZED_CODE_SUPPORTED
-#    define PNG_OPTIMIZED_CODE_SUPPORTED
-#  endif
-#endif
-
 #if defined(PNG_READ_SUPPORTED) && !defined(PNG_NO_ASSEMBLER_CODE)
 #  ifndef PNG_ASSEMBLER_CODE_SUPPORTED
 #    define PNG_ASSEMBLER_CODE_SUPPORTED
 #  endif
-
-#  if defined(__GNUC__) && defined(__x86_64__) && (__GNUC__ < 4)
-     /* work around 64-bit gcc compiler bugs in gcc-3.x */
-#    if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE)
-#      define PNG_NO_MMX_CODE
-#    endif
+#  if defined(XP_MACOSX) && !defined(PNG_NO_MMX_CODE)
+     /* work around Intel-Mac compiler bug */
+#    define PNG_NO_MMX_CODE
 #  endif
-
-#  if defined(__APPLE__)
-#    if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE)
-#      define PNG_NO_MMX_CODE
-#    endif
-#  endif
-
-#  if (defined(__MWERKS__) && ((__MWERKS__ < 0x0900) || macintosh))
-#    if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE)
-#      define PNG_NO_MMX_CODE
-#    endif
-#  endif
-
-#  if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE)
+#  if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE) && \
+     defined(__MMX__)
 #    define PNG_MMX_CODE_SUPPORTED
 #  endif
-
+#  if !defined(PNG_USE_PNGGCCRD) && !defined(PNG_NO_MMX_CODE) && \
+     !defined(PNG_USE_PNGVCRD) && defined(__MMX__)
+#    define PNG_USE_PNGGCCRD
+#  endif
 #endif
-/* end of obsolete code to be removed from libpng-1.4.0 */
+
+/* If you are sure that you don't need thread safety and you are compiling
+   with PNG_USE_PNGCCRD for an MMX application, you can define this for
+   faster execution.  See pnggccrd.c.
+#define PNG_THREAD_UNSAFE_OK
+*/
 
 #if !defined(PNG_1_0_X)
 #if !defined(PNG_NO_USER_MEM) && !defined(PNG_USER_MEM_SUPPORTED)
 #  define PNG_USER_MEM_SUPPORTED
 #endif
 #endif /* PNG_1_0_X */
 
 /* Added at libpng-1.2.6 */
@@ -1295,18 +1280,17 @@ typedef z_stream FAR *  png_zstreamp;
 #  endif
 #endif
 
 /* Do not use global arrays (helps with building DLL's)
  * They are no longer used in libpng itself, since version 1.0.5c,
  * but might be required for some pre-1.0.5c applications.
  */
 #if !defined(PNG_USE_LOCAL_ARRAYS) && !defined(PNG_USE_GLOBAL_ARRAYS)
-#  if defined(PNG_NO_GLOBAL_ARRAYS) || \
-      (defined(__GNUC__) && defined(PNG_DLL)) || defined(_MSC_VER)
+#  if defined(PNG_NO_GLOBAL_ARRAYS) || (defined(__GNUC__) && defined(PNG_DLL))
 #    define PNG_USE_LOCAL_ARRAYS
 #  else
 #    define PNG_USE_GLOBAL_ARRAYS
 #  endif
 #endif
 
 #if defined(__CYGWIN__)
 #  undef PNGAPI
@@ -1433,47 +1417,25 @@ typedef z_stream FAR *  png_zstreamp;
 #endif
 
 #if defined(USE_FAR_KEYWORD)  /* memory model independent fns */
 /* use this to make far-to-near assignments */
 #  define CHECK   1
 #  define NOCHECK 0
 #  define CVT_PTR(ptr) (png_far_to_near(png_ptr,ptr,CHECK))
 #  define CVT_PTR_NOCHECK(ptr) (png_far_to_near(png_ptr,ptr,NOCHECK))
-#  define png_snprintf _fsnprintf   /* Added to v 1.2.19 */
 #  define png_strcpy  _fstrcpy
 #  define png_strncpy _fstrncpy   /* Added to v 1.2.6 */
 #  define png_strlen  _fstrlen
 #  define png_memcmp  _fmemcmp    /* SJT: added */
 #  define png_memcpy  _fmemcpy
 #  define png_memset  _fmemset
 #else /* use the usual functions */
 #  define CVT_PTR(ptr)         (ptr)
 #  define CVT_PTR_NOCHECK(ptr) (ptr)
-#  ifndef PNG_NO_SNPRINTF
-#    ifdef _MSC_VER
-#      define png_snprintf _snprintf   /* Added to v 1.2.19 */
-#      define png_snprintf2 _snprintf
-#      define png_snprintf6 _snprintf
-#    else
-#      define png_snprintf snprintf   /* Added to v 1.2.19 */
-#      define png_snprintf2 snprintf
-#      define png_snprintf6 snprintf
-#    endif
-#  else
-     /* You don't have or don't want to use snprintf().  Caution: Using
-      * sprintf instead of snprintf exposes your application to accidental
-      * or malevolent buffer overflows.  If you don't have snprintf()
-      * as a general rule you should provide one (you can get one from
-      * Portable OpenSSH). */
-#    define png_snprintf(s1,n,fmt,x1) sprintf(s1,fmt,x1)
-#    define png_snprintf2(s1,n,fmt,x1,x2) sprintf(s1,fmt,x1,x2)
-#    define png_snprintf6(s1,n,fmt,x1,x2,x3,x4,x5,x6) \
-        sprintf(s1,fmt,x1,x2,x3,x4,x5,x6)
-#  endif
 #  define png_strcpy  strcpy
 #  define png_strncpy strncpy     /* Added to v 1.2.6 */
 #  define png_strlen  strlen
 #  define png_memcmp  memcmp      /* SJT: added */
 #  define png_memcpy  memcpy
 #  define png_memset  memset
 #endif
 /* End of memory model independent support */
@@ -1481,12 +1443,53 @@ typedef z_stream FAR *  png_zstreamp;
 /* Just a little check that someone hasn't tried to define something
  * contradictory.
  */
 #if (PNG_ZBUF_SIZE > 65536L) && defined(PNG_MAX_MALLOC_64K)
 #  undef PNG_ZBUF_SIZE
 #  define PNG_ZBUF_SIZE 65536L
 #endif
 
+#ifdef PNG_READ_SUPPORTED
+/* Prior to libpng-1.0.9, this block was in pngasmrd.h */
+#if defined(PNG_INTERNAL)
+
+/* These are the default thresholds before the MMX code kicks in; if either
+ * rowbytes or bitdepth is below the threshold, plain C code is used.  These
+ * can be overridden at runtime via the png_set_mmx_thresholds() call in
+ * libpng 1.2.0 and later.  The values below were chosen by Intel.
+ */
+
+#ifndef PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT
+#  define PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT  128  /*  >=  */
+#endif
+#ifndef PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT
+#  define PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT  9    /*  >=  */   
+#endif
+
+/* Set this in the makefile for VC++ on Pentium, not here. */
+/* Platform must be Pentium.  Makefile must assemble and load pngvcrd.c .
+ * MMX will be detected at run time and used if present.
+ */
+#ifdef PNG_USE_PNGVCRD
+#  define PNG_HAVE_ASSEMBLER_COMBINE_ROW
+#  define PNG_HAVE_ASSEMBLER_READ_INTERLACE
+#  define PNG_HAVE_ASSEMBLER_READ_FILTER_ROW
+#endif
+
+/* Set this in the makefile for gcc/as on Pentium, not here. */
+/* Platform must be Pentium.  Makefile must assemble and load pnggccrd.c .
+ * MMX will be detected at run time and used if present.
+ */
+#ifdef PNG_USE_PNGGCCRD
+#  define PNG_HAVE_ASSEMBLER_COMBINE_ROW
+#  define PNG_HAVE_ASSEMBLER_READ_INTERLACE
+#  define PNG_HAVE_ASSEMBLER_READ_FILTER_ROW
+#endif
+/* - see pnggccrd.c for info about what is currently enabled */
+
+#endif /* PNG_INTERNAL */
+#endif /* PNG_READ_SUPPORTED */
+
 /* Added at libpng-1.2.8 */
 #endif /* PNG_VERSION_INFO_ONLY */
 
 #endif /* PNGCONF_H */
--- a/modules/libimg/png/pngerror.c
+++ b/modules/libimg/png/pngerror.c
@@ -1,42 +1,39 @@
 
 /* pngerror.c - stub functions for i/o and memory allocation
  *
- * Last changed in libpng 1.2.20 October 4, 2007
+ * Last changed in libpng 1.2.13 November 13, 2006
  * For conditions of distribution and use, see copyright notice in png.h
- * Copyright (c) 1998-2007 Glenn Randers-Pehrson
+ * Copyright (c) 1998-2006 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
  *
  * This file provides a location for all error handling.  Users who
  * need special error handling are expected to write replacement functions
  * and use png_set_error_fn() to use those functions.  See the instructions
  * at each function.
  */
 
 #define PNG_INTERNAL
 #include "png.h"
 
 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
 static void /* PRIVATE */
 png_default_error PNGARG((png_structp png_ptr,
   png_const_charp error_message));
-#ifndef PNG_NO_WARNINGS
 static void /* PRIVATE */
 png_default_warning PNGARG((png_structp png_ptr,
   png_const_charp warning_message));
-#endif /* PNG_NO_WARNINGS */
 
 /* This function is called whenever there is a fatal error.  This function
  * should not be changed.  If there is a need to handle errors differently,
  * you should supply a replacement error function and use png_set_error_fn()
  * to replace the error function at run-time.
  */
-#ifndef PNG_NO_ERROR_TEXT
 void PNGAPI
 png_error(png_structp png_ptr, png_const_charp error_message)
 {
 #ifdef PNG_ERROR_NUMBERS_SUPPORTED
    char msg[16];
    if (png_ptr != NULL)
    {
      if (png_ptr->flags&
@@ -73,30 +70,17 @@ png_error(png_structp png_ptr, png_const
 #endif
    if (png_ptr != NULL && png_ptr->error_fn != NULL)
       (*(png_ptr->error_fn))(png_ptr, error_message);
 
    /* If the custom handler doesn't exist, or if it returns,
       use the default handler, which will not return. */
    png_default_error(png_ptr, error_message);
 }
-#else
-void PNGAPI
-png_err(png_structp png_ptr)
-{
-   if (png_ptr != NULL && png_ptr->error_fn != NULL)
-      (*(png_ptr->error_fn))(png_ptr, '\0');
 
-   /* If the custom handler doesn't exist, or if it returns,
-      use the default handler, which will not return. */
-   png_default_error(png_ptr, '\0');
-}
-#endif /* PNG_NO_ERROR_TEXT */
-
-#ifndef PNG_NO_WARNINGS
 /* This function is called whenever there is a non-fatal error.  This function
  * should not be changed.  If there is a need to handle warnings differently,
  * you should supply a replacement warning function and use
  * png_set_error_fn() to replace the warning function at run-time.
  */
 void PNGAPI
 png_warning(png_structp png_ptr, png_const_charp warning_message)
 {
@@ -116,32 +100,29 @@ png_warning(png_structp png_ptr, png_con
        }
      }
      if (png_ptr != NULL && png_ptr->warning_fn != NULL)
         (*(png_ptr->warning_fn))(png_ptr, warning_message+offset);
    }
    else
       png_default_warning(png_ptr, warning_message+offset);
 }
-#endif /* PNG_NO_WARNINGS */
-
 
 /* These utilities are used internally to build an error message that relates
  * to the current chunk.  The chunk name comes from png_ptr->chunk_name,
  * this is used to prefix the message.  The message is limited in length
  * to 63 bytes, the name characters are output as hex digits wrapped in []
  * if the character is invalid.
  */
 #define isnonalpha(c) ((c) < 65 || (c) > 122 || ((c) > 90 && (c) < 97))
 static PNG_CONST char png_digit[16] = {
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'A', 'B', 'C', 'D', 'E', 'F'
 };
 
-#if !defined(PNG_NO_WARNINGS) || !defined(PNG_NO_ERROR_TEXT)
 static void /* PRIVATE */
 png_format_buffer(png_structp png_ptr, png_charp buffer, png_const_charp
    error_message)
 {
    int iout = 0, iin = 0;
 
    while (iin < 4)
    {
@@ -165,47 +146,41 @@ png_format_buffer(png_structp png_ptr, p
    {
       buffer[iout++] = ':';
       buffer[iout++] = ' ';
       png_strncpy(buffer+iout, error_message, 63);
       buffer[iout+63] = 0;
    }
 }
 
-#ifdef PNG_READ_SUPPORTED
 void PNGAPI
 png_chunk_error(png_structp png_ptr, png_const_charp error_message)
 {
    char msg[18+64];
    if (png_ptr == NULL)
      png_error(png_ptr, error_message);
    else
    {
      png_format_buffer(png_ptr, msg, error_message);
      png_error(png_ptr, msg);
    }
 }
-#endif /* PNG_READ_SUPPORTED */
-#endif /* !defined(PNG_NO_WARNINGS) || !defined(PNG_NO_ERROR_TEXT) */
 
-#ifndef PNG_NO_WARNINGS
 void PNGAPI
 png_chunk_warning(png_structp png_ptr, png_const_charp warning_message)
 {
    char msg[18+64];
    if (png_ptr == NULL)
      png_warning(png_ptr, warning_message);
    else
    {
      png_format_buffer(png_ptr, msg, warning_message);
      png_warning(png_ptr, msg);
    }
 }
-#endif /* PNG_NO_WARNINGS */
-
 
 /* This is the default error handling function.  Note that replacements for
  * this function MUST NOT RETURN, or the program will likely crash.  This
  * function is used by default, or if the program supplies NULL for the
  * error function pointer in png_set_error_fn().
  */
 static void /* PRIVATE */
 png_default_error(png_structp png_ptr, png_const_charp error_message)
@@ -237,32 +212,33 @@ png_default_error(png_structp png_ptr, p
 #endif
 
 #ifdef PNG_SETJMP_SUPPORTED
    if (png_ptr)
    {
 #  ifdef USE_FAR_KEYWORD
    {
       jmp_buf jmpbuf;
-      png_memcpy(jmpbuf, png_ptr->jmpbuf, png_sizeof(jmp_buf));
+      png_memcpy(jmpbuf,png_ptr->jmpbuf,png_sizeof(jmp_buf));
       longjmp(jmpbuf, 1);
    }
 #  else
    longjmp(png_ptr->jmpbuf, 1);
 #  endif
    }
 #else
    PNG_ABORT();
 #endif
 #ifdef PNG_NO_CONSOLE_IO
-   error_message = error_message; /* make compiler happy */
+   /* make compiler happy */ ;
+   if (&error_message != NULL)
+      return;
 #endif
 }
 
-#ifndef PNG_NO_WARNINGS
 /* This function is called when there is a warning, but the library thinks
  * it can continue anyway.  Replacement functions don't have to do anything
  * here if you don't want them to.  In the default configuration, png_ptr is
  * not used, but it is passed in case it may be useful.
  */
 static void /* PRIVATE */
 png_default_warning(png_structp png_ptr, png_const_charp warning_message)
 {
@@ -286,21 +262,24 @@ png_default_warning(png_structp png_ptr,
      }
      else
        fprintf(stderr, "libpng warning: %s\n", warning_message);
    }
    else
 #  endif
      fprintf(stderr, "libpng warning: %s\n", warning_message);
 #else
-   warning_message = warning_message; /* make compiler happy */
+   /* make compiler happy */ ;
+   if (warning_message)
+     return;
 #endif
-   png_ptr = png_ptr; /* make compiler happy */
+   /* make compiler happy */ ;
+   if (png_ptr)
+      return;
 }
-#endif /* PNG_NO_WARNINGS */
 
 /* This function is called when the application wants to use another method
  * of handling errors and warnings.  Note that the error function MUST NOT
  * return to the calling routine or serious problems will occur.  The return
  * method used in the default routine calls longjmp(png_ptr->jmpbuf, 1)
  */
 void PNGAPI
 png_set_error_fn(png_structp png_ptr, png_voidp error_ptr,
--- a/modules/libimg/png/pnggccrd.c
+++ b/modules/libimg/png/pnggccrd.c
@@ -1,1 +1,5420 @@
-/* pnggccrd.c was removed from libpng-1.2.20. */
+
+/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
+ *
+ * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
+ *
+ *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
+ *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
+ *     for Intel's performance analysis of the MMX vs. non-MMX code.
+ *
+ * Last changed in libpng 1.2.15 January 5, 2007
+ * For conditions of distribution and use, see copyright notice in png.h
+ * Copyright (c) 1998-2007 Glenn Randers-Pehrson
+ * Copyright (c) 1998, Intel Corporation
+ *
+ * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
+ * Interface to libpng contributed by Gilles Vollant, 1999.
+ * GNU C port by Greg Roelofs, 1999-2001.
+ *
+ * Lines 2350-4300 converted in place with intel2gas 1.3.1:
+ *
+ *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
+ *
+ * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
+ *
+ * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
+ *        is required to assemble the newer MMX instructions such as movq.
+ *        For djgpp, see
+ *
+ *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
+ *
+ *        (or a later version in the same directory).  For Linux, check your
+ *        distribution's web site(s) or try these links:
+ *
+ *           http://rufus.w3.org/linux/RPM/binutils.html
+ *           http://www.debian.org/Packages/stable/devel/binutils.html
+ *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
+ *             binutils.tgz
+ *
+ *        For other platforms, see the main GNU site:
+ *
+ *           ftp://ftp.gnu.org/pub/gnu/binutils/
+ *
+ *        Version 2.5.2l.15 is definitely too old...
+ */
+
+/*
+ * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
+ * =====================================
+ *
+ * 19991006:
+ *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
+ *
+ * 19991007:
+ *  - additional optimizations (possible or definite):
+ *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
+ *     - write MMX code for 48-bit case (pixel_bytes == 6)
+ *     - figure out what's up with 24-bit case (pixel_bytes == 3):
+ *        why subtract 8 from width_mmx in the pass 4/5 case?
+ *        (only width_mmx case) (near line 1606)
+ *     x [DONE] replace pixel_bytes within each block with the true
+ *        constant value (or are compilers smart enough to do that?)
+ *     - rewrite all MMX interlacing code so it's aligned with
+ *        the *beginning* of the row buffer, not the end.  This
+ *        would not only allow one to eliminate half of the memory
+ *        writes for odd passes (that is, pass == odd), it may also
+ *        eliminate some unaligned-data-access exceptions (assuming
+ *        there's a penalty for not aligning 64-bit accesses on
+ *        64-bit boundaries).  The only catch is that the "leftover"
+ *        pixel(s) at the end of the row would have to be saved,
+ *        but there are enough unused MMX registers in every case,
+ *        so this is not a problem.  A further benefit is that the
+ *        post-MMX cleanup code (C code) in at least some of the
+ *        cases could be done within the assembler block.
+ *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
+ *     inconsistent, and don't match the MMX Programmer's Reference
+ *     Manual conventions anyway.  They should be changed to
+ *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
+ *     was lowest in memory (e.g., corresponding to a left pixel)
+ *     and b7 is the byte that was highest (e.g., a right pixel).
+ *
+ * 19991016:
+ *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
+ *     want globals prefixed by underscores when referencing them--
+ *     i.e., if the variable is const4, then refer to it as const4,
+ *     not _const4.  This seems to be a djgpp-specific requirement.
+ *     Also, such variables apparently *must* be declared outside
+ *     of functions; neither static nor automatic variables work if
+ *     defined within the scope of a single function, but both
+ *     static and truly global (multi-module) variables work fine.
+ *
+ * 19991023:
+ *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
+ *  - switched from string-concatenation-with-macros to cleaner method of
+ *     renaming global variables for djgpp--i.e., always use prefixes in
+ *     inlined assembler code (== strings) and conditionally rename the
+ *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
+ *
+ * 19991024:
+ *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
+ *     This one was severely weird:  even though mmxsupport() doesn't touch
+ *     ebx (where "row" pointer was stored), it nevertheless managed to zero
+ *     the register (even in static/non-fPIC code--see below), which in turn
+ *     caused png_do_read_interlace() to return prematurely on the first row of
+ *     interlaced images (i.e., without expanding the interlaced pixels).
+ *     Inspection of the generated assembly code didn't turn up any clues,
+ *     although it did point at a minor optimization (i.e., get rid of
+ *     mmx_supported_local variable and just use eax).  Possibly the CPUID
+ *     instruction is more destructive than it looks?  (Not yet checked.)
+ *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
+ *     listings...  Apparently register spillage has to do with ebx, since
+ *     it's used to index the global offset table.  Commenting it out of the
+ *     input-reg lists in png_combine_row() eliminated compiler barfage, so
+ *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
+ *
+ * 19991107:
+ *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
+ *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
+ *
+ * 19991120:
+ *  - made "diff" variable (now "_dif") global to simplify conversion of
+ *     filtering routines (running out of regs, sigh).  "diff" is still used
+ *     in interlacing routines, however.
+ *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
+ *     macro determines which is used); original not yet tested.
+ *
+ * 20000213:
+ *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
+ *
+ * 20000319:
+ *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
+ *     pass == 4 or 5, that caused visible corruption of interlaced images
+ *
+ * 20000623:
+ *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
+ *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
+ *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
+ *     Chuck Wilson supplied a patch involving dummy output registers.  See
+ *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
+ *     for the original (anonymous) SourceForge bug report.
+ *
+ * 20000706:
+ *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
+ *       pnggccrd.c: In function `png_combine_row':
+ *       pnggccrd.c:525: more than 10 operands in `asm'
+ *       pnggccrd.c:669: more than 10 operands in `asm'
+ *       pnggccrd.c:828: more than 10 operands in `asm'
+ *       pnggccrd.c:994: more than 10 operands in `asm'
+ *       pnggccrd.c:1177: more than 10 operands in `asm'
+ *     They are all the same problem and can be worked around by using the
+ *     global _unmask variable unconditionally, not just in the -fPIC case.
+ *     Reportedly earlier versions of gcc also have the problem with more than
+ *     10 operands; they just don't report it.  Much strangeness ensues, etc.
+ *
+ * 20000729:
+ *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
+ *     MMX routine); began converting png_read_filter_row_mmx_sub()
+ *  - to finish remaining sections:
+ *     - clean up indentation and comments
+ *     - preload local variables
+ *     - add output and input regs (order of former determines numerical
+ *        mapping of latter)
+ *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
+ *     - remove "$" from addressing of Shift and Mask variables [20000823]
+ *
+ * 20000731:
+ *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
+ *
+ * 20000822:
+ *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
+ *     shared-library (-fPIC) version!  Code works just fine as part of static
+ *     library.  Damn damn damn damn damn, should have tested that sooner.
+ *     ebx is getting clobbered again (explicitly this time); need to save it
+ *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
+ *
+ * 20000823:
+ *  - first section was trickiest; all remaining sections have ebx -> edx now.
+ *     (-fPIC works again.)  Also added missing underscores to various Shift*
+ *     and *Mask* globals and got rid of leading "$" signs.
+ *
+ * 20000826:
+ *  - added visual separators to help navigate microscopic printed copies
+ *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
+ *     on png_read_filter_row_mmx_avg()
+ *
+ * 20000828:
+ *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
+ *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
+ *     cleaned up/shortened in either routine, but functionality is complete
+ *     and seems to be working fine.
+ *
+ * 20000829:
+ *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
+ *     as an input reg (with dummy output variables, etc.), then it *cannot*
+ *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
+ *     is simple enough...
+ *
+ * 20000914:
+ *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
+ *     correctly (but 48-bit RGB just fine)
+ *
+ * 20000916:
+ *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
+ *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
+ *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
+ *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
+ *
+ * 20010101:
+ *  - added new png_init_mmx_flags() function (here only because it needs to
+ *     call mmxsupport(), which should probably become global png_mmxsupport());
+ *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
+ *
+ * 20010103:
+ *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
+ *     and made it public; moved png_init_mmx_flags() to png.c as internal func
+ *
+ * 20010104:
+ *  - removed dependency on png_read_filter_row_c() (C code already duplicated
+ *     within MMX version of png_read_filter_row()) so no longer necessary to
+ *     compile it into pngrutil.o
+ *
+ * 20010310:
+ *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
+ *
+ * 20020304:
+ *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
+ *
+ * 20040724:
+ *   - more tinkering with clobber list at lines 4529 and 5033, to get
+ *     it to compile on gcc-3.4.
+ *
+ * STILL TO DO:
+ *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
+ *     - write MMX code for 48-bit case (pixel_bytes == 6)
+ *     - figure out what's up with 24-bit case (pixel_bytes == 3):
+ *        why subtract 8 from width_mmx in the pass 4/5 case?
+ *        (only width_mmx case) (near line 1606)
+ *     - rewrite all MMX interlacing code so it's aligned with beginning
+ *        of the row buffer, not the end (see 19991007 for details)
+ *     x pick one version of mmxsupport() and get rid of the other
+ *     - add error messages to any remaining bogus default cases
+ *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
+ *     x add support for runtime enable/disable/query of various MMX routines
+ */
+
+#define PNG_INTERNAL
+#include "png.h"
+
+#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
+
+int PNGAPI png_mmx_support(void);
+
+#ifdef PNG_USE_LOCAL_ARRAYS
+static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
+static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
+static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
+#endif
+
+#if defined(PNG_MMX_CODE_SUPPORTED)
+/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
+ * so define them without: */
+#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
+    defined(__OS2__)
+#  define _mmx_supported  mmx_supported
+#  define _const4         const4
+#  define _const6         const6
+#  define _mask8_0        mask8_0
+#  define _mask16_1       mask16_1
+#  define _mask16_0       mask16_0
+#  define _mask24_2       mask24_2
+#  define _mask24_1       mask24_1
+#  define _mask24_0       mask24_0
+#  define _mask32_3       mask32_3
+#  define _mask32_2       mask32_2
+#  define _mask32_1       mask32_1
+#  define _mask32_0       mask32_0
+#  define _mask48_5       mask48_5
+#  define _mask48_4       mask48_4
+#  define _mask48_3       mask48_3
+#  define _mask48_2       mask48_2
+#  define _mask48_1       mask48_1
+#  define _mask48_0       mask48_0
+#  define _LBCarryMask    LBCarryMask
+#  define _HBClearMask    HBClearMask
+#  define _ActiveMask     ActiveMask
+#  define _ActiveMask2    ActiveMask2
+#  define _ActiveMaskEnd  ActiveMaskEnd
+#  define _ShiftBpp       ShiftBpp
+#  define _ShiftRem       ShiftRem
+#ifdef PNG_THREAD_UNSAFE_OK
+#  define _unmask         unmask
+#  define _FullLength     FullLength
+#  define _MMXLength      MMXLength
+#  define _dif            dif
+#  define _patemp         patemp
+#  define _pbtemp         pbtemp
+#  define _pctemp         pctemp
+#endif
+#endif
+
+
+/* These constants are used in the inlined MMX assembly code.
+   Ignore gcc's "At top level: defined but not used" warnings. */
+
+/* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
+ *  since that case uses the %ebx register for indexing the Global Offset Table
+ *  and there were no other registers available.  But gcc 2.95 and later emit
+ *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
+ *  in the non-PIC case, so we'll just use the global unconditionally now.
+ */
+#ifdef PNG_THREAD_UNSAFE_OK
+static int _unmask;
+#endif
+
+static unsigned long long _mask8_0  = 0x0102040810204080LL;
+
+static unsigned long long _mask16_1 = 0x0101020204040808LL;
+static unsigned long long _mask16_0 = 0x1010202040408080LL;
+
+static unsigned long long _mask24_2 = 0x0101010202020404LL;
+static unsigned long long _mask24_1 = 0x0408080810101020LL;
+static unsigned long long _mask24_0 = 0x2020404040808080LL;
+
+static unsigned long long _mask32_3 = 0x0101010102020202LL;
+static unsigned long long _mask32_2 = 0x0404040408080808LL;
+static unsigned long long _mask32_1 = 0x1010101020202020LL;
+static unsigned long long _mask32_0 = 0x4040404080808080LL;
+
+static unsigned long long _mask48_5 = 0x0101010101010202LL;
+static unsigned long long _mask48_4 = 0x0202020204040404LL;
+static unsigned long long _mask48_3 = 0x0404080808080808LL;
+static unsigned long long _mask48_2 = 0x1010101010102020LL;
+static unsigned long long _mask48_1 = 0x2020202040404040LL;
+static unsigned long long _mask48_0 = 0x4040808080808080LL;
+
+static unsigned long long _const4   = 0x0000000000FFFFFFLL;
+//static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
+static unsigned long long _const6   = 0x00000000000000FFLL;
+
+// These are used in the row-filter routines and should/would be local
+//  variables if not for gcc addressing limitations.
+// WARNING: Their presence probably defeats the thread safety of libpng.
+
+#ifdef PNG_THREAD_UNSAFE_OK
+static png_uint_32  _FullLength;
+static png_uint_32  _MMXLength;
+static int          _dif;
+static int          _patemp; // temp variables for Paeth routine
+static int          _pbtemp;
+static int          _pctemp;
+#endif
+
+void /* PRIVATE */
+png_squelch_warnings(void)
+{
+#ifdef PNG_THREAD_UNSAFE_OK
+   _dif = _dif;
+   _patemp = _patemp;
+   _pbtemp = _pbtemp;
+   _pctemp = _pctemp;
+   _MMXLength = _MMXLength;
+#endif
+   _const4  = _const4;
+   _const6  = _const6;
+   _mask8_0  = _mask8_0;
+   _mask16_1 = _mask16_1;
+   _mask16_0 = _mask16_0;
+   _mask24_2 = _mask24_2;
+   _mask24_1 = _mask24_1;
+   _mask24_0 = _mask24_0;
+   _mask32_3 = _mask32_3;
+   _mask32_2 = _mask32_2;
+   _mask32_1 = _mask32_1;
+   _mask32_0 = _mask32_0;
+   _mask48_5 = _mask48_5;
+   _mask48_4 = _mask48_4;
+   _mask48_3 = _mask48_3;
+   _mask48_2 = _mask48_2;
+   _mask48_1 = _mask48_1;
+   _mask48_0 = _mask48_0;
+}
+#endif /* PNG_MMX_CODE_SUPPORTED */
+
+
+static int _mmx_supported = 2;
+
+/*===========================================================================*/
+/*                                                                           */
+/*                       P N G _ C O M B I N E _ R O W                       */
+/*                                                                           */
+/*===========================================================================*/
+
+#if defined(PNG_HAVE_MMX_COMBINE_ROW)
+
+#define BPP2  2
+#define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
+#define BPP4  4
+#define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
+#define BPP8  8
+
+/* Combines the row recently read in with the previous row.
+   This routine takes care of alpha and transparency if requested.
+   This routine also handles the two methods of progressive display
+   of interlaced images, depending on the mask value.
+   The mask value describes which pixels are to be combined with
+   the row.  The pattern always repeats every 8 pixels, so just 8
+   bits are needed.  A one indicates the pixel is to be combined; a
+   zero indicates the pixel is to be skipped.  This is in addition
+   to any alpha or transparency value associated with the pixel.
+   If you want all pixels to be combined, pass 0xff (255) in mask. */
+
+/* Use this routine for the x86 platform - it uses a faster MMX routine
+   if the machine supports MMX. */
+
+void /* PRIVATE */
+png_combine_row(png_structp png_ptr, png_bytep row, int mask)
+{
+   png_debug(1, "in png_combine_row (pnggccrd.c)\n");
+
+#if defined(PNG_MMX_CODE_SUPPORTED)
+   if (_mmx_supported == 2) {
+#if !defined(PNG_1_0_X)
+       /* this should have happened in png_init_mmx_flags() already */
+       png_warning(png_ptr, "asm_flags may not have been initialized");
+#endif
+       png_mmx_support();
+   }
+#endif
+
+   if (mask == 0xff)
+   {
+      png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
+      png_memcpy(row, png_ptr->row_buf + 1,
+       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
+   }
+   else   /* (png_combine_row() is never called with mask == 0) */
+   {
+      switch (png_ptr->row_info.pixel_depth)
+      {
+         case 1:        /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep sp;
+            png_bytep dp;
+            int s_inc, s_start, s_end;
+            int m;
+            int shift;
+            png_uint_32 i;
+
+            sp = png_ptr->row_buf + 1;
+            dp = row;
+            m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+            if (png_ptr->transformations & PNG_PACKSWAP)
+            {
+                s_start = 0;
+                s_end = 7;
+                s_inc = 1;
+            }
+            else
+#endif
+            {
+                s_start = 7;
+                s_end = 0;
+                s_inc = -1;
+            }
+
+            shift = s_start;
+
+            for (i = 0; i < png_ptr->width; i++)
+            {
+               if (m & mask)
+               {
+                  int value;
+
+                  value = (*sp >> shift) & 0x1;
+                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
+                  *dp |= (png_byte)(value << shift);
+               }
+
+               if (shift == s_end)
+               {
+                  shift = s_start;
+                  sp++;
+                  dp++;
+               }
+               else
+                  shift += s_inc;
+
+               if (m == 1)
+                  m = 0x80;
+               else
+                  m >>= 1;
+            }
+            break;
+         }
+
+         case 2:        /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep sp;
+            png_bytep dp;
+            int s_start, s_end, s_inc;
+            int m;
+            int shift;
+            png_uint_32 i;
+            int value;
+
+            sp = png_ptr->row_buf + 1;
+            dp = row;
+            m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+            if (png_ptr->transformations & PNG_PACKSWAP)
+            {
+               s_start = 0;
+               s_end = 6;
+               s_inc = 2;
+            }
+            else
+#endif
+            {
+               s_start = 6;
+               s_end = 0;
+               s_inc = -2;
+            }
+
+            shift = s_start;
+
+            for (i = 0; i < png_ptr->width; i++)
+            {
+               if (m & mask)
+               {
+                  value = (*sp >> shift) & 0x3;
+                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
+                  *dp |= (png_byte)(value << shift);
+               }
+
+               if (shift == s_end)
+               {
+                  shift = s_start;
+                  sp++;
+                  dp++;
+               }
+               else
+                  shift += s_inc;
+               if (m == 1)
+                  m = 0x80;
+               else
+                  m >>= 1;
+            }
+            break;
+         }
+
+         case 4:        /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep sp;
+            png_bytep dp;
+            int s_start, s_end, s_inc;
+            int m;
+            int shift;
+            png_uint_32 i;
+            int value;
+
+            sp = png_ptr->row_buf + 1;
+            dp = row;
+            m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+            if (png_ptr->transformations & PNG_PACKSWAP)
+            {
+               s_start = 0;
+               s_end = 4;
+               s_inc = 4;
+            }
+            else
+#endif
+            {
+               s_start = 4;
+               s_end = 0;
+               s_inc = -4;
+            }
+            shift = s_start;
+
+            for (i = 0; i < png_ptr->width; i++)
+            {
+               if (m & mask)
+               {
+                  value = (*sp >> shift) & 0xf;
+                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
+                  *dp |= (png_byte)(value << shift);
+               }
+
+               if (shift == s_end)
+               {
+                  shift = s_start;
+                  sp++;
+                  dp++;
+               }
+               else
+                  shift += s_inc;
+               if (m == 1)
+                  m = 0x80;
+               else
+                  m >>= 1;
+            }
+            break;
+         }
+
+         case 8:        /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
+                /* && _mmx_supported */ )
+#else
+            if (_mmx_supported)
+#endif
+            {
+               png_uint_32 len;
+               int diff;
+               int dummy_value_a;   // fix 'forbidden register spilled' error
+               int dummy_value_d;
+               int dummy_value_c;
+               int dummy_value_S;
+               int dummy_value_D;
+               _unmask = ~mask;            // global variable for -fPIC version
+               srcptr = png_ptr->row_buf + 1;
+               dstptr = row;
+               len  = png_ptr->width &~7;  // reduce to multiple of 8
+               diff = (int) (png_ptr->width & 7);  // amount lost
+
+               __asm__ __volatile__ (
+                  "movd      _unmask, %%mm7  \n\t" // load bit pattern
+                  "psubb     %%mm6, %%mm6    \n\t" // zero mm6
+                  "punpcklbw %%mm7, %%mm7    \n\t"
+                  "punpcklwd %%mm7, %%mm7    \n\t"
+                  "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
+
+                  "movq      _mask8_0, %%mm0 \n\t"
+                  "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
+                  "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
+
+// preload        "movl      len, %%ecx      \n\t" // load length of line
+// preload        "movl      srcptr, %%esi   \n\t" // load source
+// preload        "movl      dstptr, %%edi   \n\t" // load dest
+
+                  "cmpl      $0, %%ecx       \n\t" // len == 0 ?
+                  "je        mainloop8end    \n\t"
+
+                "mainloop8:                  \n\t"
+                  "movq      (%%esi), %%mm4  \n\t" // *srcptr
+                  "pand      %%mm0, %%mm4    \n\t"
+                  "movq      %%mm0, %%mm6    \n\t"
+                  "pandn     (%%edi), %%mm6  \n\t" // *dstptr
+                  "por       %%mm6, %%mm4    \n\t"
+                  "movq      %%mm4, (%%edi)  \n\t"
+                  "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
+                  "addl      $8, %%edi       \n\t"
+                  "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
+                  "ja        mainloop8       \n\t"
+
+                "mainloop8end:               \n\t"
+// preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
+                  "movl      %%eax, %%ecx    \n\t"
+                  "cmpl      $0, %%ecx       \n\t"
+                  "jz        end8            \n\t"
+// preload        "movl      mask, %%edx     \n\t"
+                  "sall      $24, %%edx      \n\t" // make low byte, high byte
+
+                "secondloop8:                \n\t"
+                  "sall      %%edx           \n\t" // move high bit to CF
+                  "jnc       skip8           \n\t" // if CF = 0
+                  "movb      (%%esi), %%al   \n\t"
+                  "movb      %%al, (%%edi)   \n\t"
+
+                "skip8:                      \n\t"
+                  "incl      %%esi           \n\t"
+                  "incl      %%edi           \n\t"
+                  "decl      %%ecx           \n\t"
+                  "jnz       secondloop8     \n\t"
+
+                "end8:                       \n\t"
+                  "EMMS                      \n\t"  // DONE
+
+                  : "=a" (dummy_value_a),           // output regs (dummy)
+                    "=d" (dummy_value_d),
+                    "=c" (dummy_value_c),
+                    "=S" (dummy_value_S),
+                    "=D" (dummy_value_D)
+
+                  : "3" (srcptr),      // esi       // input regs
+                    "4" (dstptr),      // edi
+                    "0" (diff),        // eax
+// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
+                    "2" (len),         // ecx
+                    "1" (mask)         // edx
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+                  : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
+#endif
+               );
+            }
+            else /* mmx _not supported - Use modified C routine */
+#endif /* PNG_MMX_CODE_SUPPORTED */
+            {
+               register png_uint_32 i;
+               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
+                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
+               register int stride = png_pass_inc[png_ptr->pass];
+                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
+               register int rep_bytes = png_pass_width[png_ptr->pass];
+                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
+               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
+               int diff = (int) (png_ptr->width & 7); /* amount lost */
+               register png_uint_32 final_val = len;  /* GRR bugfix */
+
+               srcptr = png_ptr->row_buf + 1 + initial_val;
+               dstptr = row + initial_val;
+
+               for (i = initial_val; i < final_val; i += stride)
+               {
+                  png_memcpy(dstptr, srcptr, rep_bytes);
+                  srcptr += stride;
+                  dstptr += stride;
+               }
+               if (diff)  /* number of leftover pixels:  3 for pngtest */
+               {
+                  final_val+=diff /* *BPP1 */ ;
+                  for (; i < final_val; i += stride)
+                  {
+                     if (rep_bytes > (int)(final_val-i))
+                        rep_bytes = (int)(final_val-i);
+                     png_memcpy(dstptr, srcptr, rep_bytes);
+                     srcptr += stride;
+                     dstptr += stride;
+                  }
+               }
+
+            } /* end of else (_mmx_supported) */
+
+            break;
+         }       /* end 8 bpp */
+
+         case 16:       /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
+                /* && _mmx_supported */ )
+#else
+            if (_mmx_supported)
+#endif
+            {
+               png_uint_32 len;
+               int diff;
+               int dummy_value_a;   // fix 'forbidden register spilled' error
+               int dummy_value_d;
+               int dummy_value_c;
+               int dummy_value_S;
+               int dummy_value_D;
+               _unmask = ~mask;            // global variable for -fPIC version
+               srcptr = png_ptr->row_buf + 1;
+               dstptr = row;
+               len  = png_ptr->width &~7;  // reduce to multiple of 8
+               diff = (int) (png_ptr->width & 7); // amount lost //
+
+               __asm__ __volatile__ (
+                  "movd      _unmask, %%mm7   \n\t" // load bit pattern
+                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
+                  "punpcklbw %%mm7, %%mm7     \n\t"
+                  "punpcklwd %%mm7, %%mm7     \n\t"
+                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
+
+                  "movq      _mask16_0, %%mm0 \n\t"
+                  "movq      _mask16_1, %%mm1 \n\t"
+
+                  "pand      %%mm7, %%mm0     \n\t"
+                  "pand      %%mm7, %%mm1     \n\t"
+
+                  "pcmpeqb   %%mm6, %%mm0     \n\t"
+                  "pcmpeqb   %%mm6, %%mm1     \n\t"
+
+// preload        "movl      len, %%ecx       \n\t" // load length of line
+// preload        "movl      srcptr, %%esi    \n\t" // load source
+// preload        "movl      dstptr, %%edi    \n\t" // load dest
+
+                  "cmpl      $0, %%ecx        \n\t"
+                  "jz        mainloop16end    \n\t"
+
+                "mainloop16:                  \n\t"
+                  "movq      (%%esi), %%mm4   \n\t"
+                  "pand      %%mm0, %%mm4     \n\t"
+                  "movq      %%mm0, %%mm6     \n\t"
+                  "movq      (%%edi), %%mm7   \n\t"
+                  "pandn     %%mm7, %%mm6     \n\t"
+                  "por       %%mm6, %%mm4     \n\t"
+                  "movq      %%mm4, (%%edi)   \n\t"
+
+                  "movq      8(%%esi), %%mm5  \n\t"
+                  "pand      %%mm1, %%mm5     \n\t"
+                  "movq      %%mm1, %%mm7     \n\t"
+                  "movq      8(%%edi), %%mm6  \n\t"
+                  "pandn     %%mm6, %%mm7     \n\t"
+                  "por       %%mm7, %%mm5     \n\t"
+                  "movq      %%mm5, 8(%%edi)  \n\t"
+
+                  "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
+                  "addl      $16, %%edi       \n\t"
+                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
+                  "ja        mainloop16       \n\t"
+
+                "mainloop16end:               \n\t"
+// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
+                  "movl      %%eax, %%ecx     \n\t"
+                  "cmpl      $0, %%ecx        \n\t"
+                  "jz        end16            \n\t"
+// preload        "movl      mask, %%edx      \n\t"
+                  "sall      $24, %%edx       \n\t" // make low byte, high byte
+
+                "secondloop16:                \n\t"
+                  "sall      %%edx            \n\t" // move high bit to CF
+                  "jnc       skip16           \n\t" // if CF = 0
+                  "movw      (%%esi), %%ax    \n\t"
+                  "movw      %%ax, (%%edi)    \n\t"
+
+                "skip16:                      \n\t"
+                  "addl      $2, %%esi        \n\t"
+                  "addl      $2, %%edi        \n\t"
+                  "decl      %%ecx            \n\t"
+                  "jnz       secondloop16     \n\t"
+
+                "end16:                       \n\t"
+                  "EMMS                       \n\t" // DONE
+
+                  : "=a" (dummy_value_a),           // output regs (dummy)
+                    "=c" (dummy_value_c),
+                    "=d" (dummy_value_d),
+                    "=S" (dummy_value_S),
+                    "=D" (dummy_value_D)
+
+                  : "0" (diff),        // eax       // input regs
+// was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
+                    "1" (len),         // ecx
+                    "2" (mask),        // edx
+                    "3" (srcptr),      // esi
+                    "4" (dstptr)       // edi
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+                  : "%mm0", "%mm1", "%mm4"          // clobber list
+                  , "%mm5", "%mm6", "%mm7"
+#endif
+               );
+            }
+            else /* mmx _not supported - Use modified C routine */
+#endif /* PNG_MMX_CODE_SUPPORTED */
+            {
+               register png_uint_32 i;
+               png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
+                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
+               register int stride = BPP2 * png_pass_inc[png_ptr->pass];
+                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
+               register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
+                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
+               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
+               int diff = (int) (png_ptr->width & 7); /* amount lost */
+               register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
+
+               srcptr = png_ptr->row_buf + 1 + initial_val;
+               dstptr = row + initial_val;
+
+               for (i = initial_val; i < final_val; i += stride)
+               {
+                  png_memcpy(dstptr, srcptr, rep_bytes);
+                  srcptr += stride;
+                  dstptr += stride;
+               }
+               if (diff)  /* number of leftover pixels:  3 for pngtest */
+               {
+                  final_val+=diff*BPP2;
+                  for (; i < final_val; i += stride)
+                  {
+                     if (rep_bytes > (int)(final_val-i))
+                        rep_bytes = (int)(final_val-i);
+                     png_memcpy(dstptr, srcptr, rep_bytes);
+                     srcptr += stride;
+                     dstptr += stride;
+                  }
+               }
+            } /* end of else (_mmx_supported) */
+
+            break;
+         }       /* end 16 bpp */
+
+         case 24:       /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
+                /* && _mmx_supported */ )
+#else
+            if (_mmx_supported)
+#endif
+            {
+               png_uint_32 len;
+               int diff;
+               int dummy_value_a;   // fix 'forbidden register spilled' error
+               int dummy_value_d;
+               int dummy_value_c;
+               int dummy_value_S;
+               int dummy_value_D;
+               _unmask = ~mask;            // global variable for -fPIC version
+               srcptr = png_ptr->row_buf + 1;
+               dstptr = row;
+               len  = png_ptr->width &~7;  // reduce to multiple of 8
+               diff = (int) (png_ptr->width & 7); // amount lost //
+
+               __asm__ __volatile__ (
+                  "movd      _unmask, %%mm7   \n\t" // load bit pattern
+                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
+                  "punpcklbw %%mm7, %%mm7     \n\t"
+                  "punpcklwd %%mm7, %%mm7     \n\t"
+                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
+
+                  "movq      _mask24_0, %%mm0 \n\t"
+                  "movq      _mask24_1, %%mm1 \n\t"
+                  "movq      _mask24_2, %%mm2 \n\t"
+
+                  "pand      %%mm7, %%mm0     \n\t"
+                  "pand      %%mm7, %%mm1     \n\t"
+                  "pand      %%mm7, %%mm2     \n\t"
+
+                  "pcmpeqb   %%mm6, %%mm0     \n\t"
+                  "pcmpeqb   %%mm6, %%mm1     \n\t"
+                  "pcmpeqb   %%mm6, %%mm2     \n\t"
+
+// preload        "movl      len, %%ecx       \n\t" // load length of line
+// preload        "movl      srcptr, %%esi    \n\t" // load source
+// preload        "movl      dstptr, %%edi    \n\t" // load dest
+
+                  "cmpl      $0, %%ecx        \n\t"
+                  "jz        mainloop24end    \n\t"
+
+                "mainloop24:                  \n\t"
+                  "movq      (%%esi), %%mm4   \n\t"
+                  "pand      %%mm0, %%mm4     \n\t"
+                  "movq      %%mm0, %%mm6     \n\t"
+                  "movq      (%%edi), %%mm7   \n\t"
+                  "pandn     %%mm7, %%mm6     \n\t"
+                  "por       %%mm6, %%mm4     \n\t"
+                  "movq      %%mm4, (%%edi)   \n\t"
+
+                  "movq      8(%%esi), %%mm5  \n\t"
+                  "pand      %%mm1, %%mm5     \n\t"
+                  "movq      %%mm1, %%mm7     \n\t"
+                  "movq      8(%%edi), %%mm6  \n\t"
+                  "pandn     %%mm6, %%mm7     \n\t"
+                  "por       %%mm7, %%mm5     \n\t"
+                  "movq      %%mm5, 8(%%edi)  \n\t"
+
+                  "movq      16(%%esi), %%mm6 \n\t"
+                  "pand      %%mm2, %%mm6     \n\t"
+                  "movq      %%mm2, %%mm4     \n\t"
+                  "movq      16(%%edi), %%mm7 \n\t"
+                  "pandn     %%mm7, %%mm4     \n\t"
+                  "por       %%mm4, %%mm6     \n\t"
+                  "movq      %%mm6, 16(%%edi) \n\t"
+
+                  "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
+                  "addl      $24, %%edi       \n\t"
+                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
+
+                  "ja        mainloop24       \n\t"
+
+                "mainloop24end:               \n\t"
+// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
+                  "movl      %%eax, %%ecx     \n\t"
+                  "cmpl      $0, %%ecx        \n\t"
+                  "jz        end24            \n\t"
+// preload        "movl      mask, %%edx      \n\t"
+                  "sall      $24, %%edx       \n\t" // make low byte, high byte
+
+                "secondloop24:                \n\t"
+                  "sall      %%edx            \n\t" // move high bit to CF
+                  "jnc       skip24           \n\t" // if CF = 0
+                  "movw      (%%esi), %%ax    \n\t"
+                  "movw      %%ax, (%%edi)    \n\t"
+                  "xorl      %%eax, %%eax     \n\t"
+                  "movb      2(%%esi), %%al   \n\t"
+                  "movb      %%al, 2(%%edi)   \n\t"
+
+                "skip24:                      \n\t"
+                  "addl      $3, %%esi        \n\t"
+                  "addl      $3, %%edi        \n\t"
+                  "decl      %%ecx            \n\t"
+                  "jnz       secondloop24     \n\t"
+
+                "end24:                       \n\t"
+                  "EMMS                       \n\t" // DONE
+
+                  : "=a" (dummy_value_a),           // output regs (dummy)
+                    "=d" (dummy_value_d),
+                    "=c" (dummy_value_c),
+                    "=S" (dummy_value_S),
+                    "=D" (dummy_value_D)
+
+                  : "3" (srcptr),      // esi       // input regs
+                    "4" (dstptr),      // edi
+                    "0" (diff),        // eax
+// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
+                    "2" (len),         // ecx
+                    "1" (mask)         // edx
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+                  : "%mm0", "%mm1", "%mm2"          // clobber list
+                  , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+               );
+            }
+            else /* mmx _not supported - Use modified C routine */
+#endif /* PNG_MMX_CODE_SUPPORTED */
+            {
+               register png_uint_32 i;
+               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
+                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
+               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
+                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
+               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
+                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
+               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
+               int diff = (int) (png_ptr->width & 7); /* amount lost */
+               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
+
+               srcptr = png_ptr->row_buf + 1 + initial_val;
+               dstptr = row + initial_val;
+
+               for (i = initial_val; i < final_val; i += stride)
+               {
+                  png_memcpy(dstptr, srcptr, rep_bytes);
+                  srcptr += stride;
+                  dstptr += stride;
+               }
+               if (diff)  /* number of leftover pixels:  3 for pngtest */
+               {
+                  final_val+=diff*BPP3;
+                  for (; i < final_val; i += stride)
+                  {
+                     if (rep_bytes > (int)(final_val-i))
+                        rep_bytes = (int)(final_val-i);
+                     png_memcpy(dstptr, srcptr, rep_bytes);
+                     srcptr += stride;
+                     dstptr += stride;
+                  }
+               }
+            } /* end of else (_mmx_supported) */
+
+            break;
+         }       /* end 24 bpp */
+
+         case 32:       /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
+                /* && _mmx_supported */ )
+#else
+            if (_mmx_supported)
+#endif
+            {
+               png_uint_32 len;
+               int diff;
+               int dummy_value_a;   // fix 'forbidden register spilled' error
+               int dummy_value_d;
+               int dummy_value_c;
+               int dummy_value_S;
+               int dummy_value_D;
+               _unmask = ~mask;            // global variable for -fPIC version
+               srcptr = png_ptr->row_buf + 1;
+               dstptr = row;
+               len  = png_ptr->width &~7;  // reduce to multiple of 8
+               diff = (int) (png_ptr->width & 7); // amount lost //
+
+               __asm__ __volatile__ (
+                  "movd      _unmask, %%mm7   \n\t" // load bit pattern
+                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
+                  "punpcklbw %%mm7, %%mm7     \n\t"
+                  "punpcklwd %%mm7, %%mm7     \n\t"
+                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
+
+                  "movq      _mask32_0, %%mm0 \n\t"
+                  "movq      _mask32_1, %%mm1 \n\t"
+                  "movq      _mask32_2, %%mm2 \n\t"
+                  "movq      _mask32_3, %%mm3 \n\t"
+
+                  "pand      %%mm7, %%mm0     \n\t"
+                  "pand      %%mm7, %%mm1     \n\t"
+                  "pand      %%mm7, %%mm2     \n\t"
+                  "pand      %%mm7, %%mm3     \n\t"
+
+                  "pcmpeqb   %%mm6, %%mm0     \n\t"
+                  "pcmpeqb   %%mm6, %%mm1     \n\t"
+                  "pcmpeqb   %%mm6, %%mm2     \n\t"
+                  "pcmpeqb   %%mm6, %%mm3     \n\t"
+
+// preload        "movl      len, %%ecx       \n\t" // load length of line
+// preload        "movl      srcptr, %%esi    \n\t" // load source
+// preload        "movl      dstptr, %%edi    \n\t" // load dest
+
+                  "cmpl      $0, %%ecx        \n\t" // lcr
+                  "jz        mainloop32end    \n\t"
+
+                "mainloop32:                  \n\t"
+                  "movq      (%%esi), %%mm4   \n\t"
+                  "pand      %%mm0, %%mm4     \n\t"
+                  "movq      %%mm0, %%mm6     \n\t"
+                  "movq      (%%edi), %%mm7   \n\t"
+                  "pandn     %%mm7, %%mm6     \n\t"
+                  "por       %%mm6, %%mm4     \n\t"
+                  "movq      %%mm4, (%%edi)   \n\t"
+
+                  "movq      8(%%esi), %%mm5  \n\t"
+                  "pand      %%mm1, %%mm5     \n\t"
+                  "movq      %%mm1, %%mm7     \n\t"
+                  "movq      8(%%edi), %%mm6  \n\t"
+                  "pandn     %%mm6, %%mm7     \n\t"
+                  "por       %%mm7, %%mm5     \n\t"
+                  "movq      %%mm5, 8(%%edi)  \n\t"
+
+                  "movq      16(%%esi), %%mm6 \n\t"
+                  "pand      %%mm2, %%mm6     \n\t"
+                  "movq      %%mm2, %%mm4     \n\t"
+                  "movq      16(%%edi), %%mm7 \n\t"
+                  "pandn     %%mm7, %%mm4     \n\t"
+                  "por       %%mm4, %%mm6     \n\t"
+                  "movq      %%mm6, 16(%%edi) \n\t"
+
+                  "movq      24(%%esi), %%mm7 \n\t"
+                  "pand      %%mm3, %%mm7     \n\t"
+                  "movq      %%mm3, %%mm5     \n\t"
+                  "movq      24(%%edi), %%mm4 \n\t"
+                  "pandn     %%mm4, %%mm5     \n\t"
+                  "por       %%mm5, %%mm7     \n\t"
+                  "movq      %%mm7, 24(%%edi) \n\t"
+
+                  "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
+                  "addl      $32, %%edi       \n\t"
+                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
+                  "ja        mainloop32       \n\t"
+
+                "mainloop32end:               \n\t"
+// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
+                  "movl      %%eax, %%ecx     \n\t"
+                  "cmpl      $0, %%ecx        \n\t"
+                  "jz        end32            \n\t"
+// preload        "movl      mask, %%edx      \n\t"
+                  "sall      $24, %%edx       \n\t" // low byte => high byte
+
+                "secondloop32:                \n\t"
+                  "sall      %%edx            \n\t" // move high bit to CF
+                  "jnc       skip32           \n\t" // if CF = 0
+                  "movl      (%%esi), %%eax   \n\t"
+                  "movl      %%eax, (%%edi)   \n\t"
+
+                "skip32:                      \n\t"
+                  "addl      $4, %%esi        \n\t"
+                  "addl      $4, %%edi        \n\t"
+                  "decl      %%ecx            \n\t"
+                  "jnz       secondloop32     \n\t"
+
+                "end32:                       \n\t"
+                  "EMMS                       \n\t" // DONE
+
+                  : "=a" (dummy_value_a),           // output regs (dummy)
+                    "=d" (dummy_value_d),
+                    "=c" (dummy_value_c),
+                    "=S" (dummy_value_S),
+                    "=D" (dummy_value_D)
+
+                  : "3" (srcptr),      // esi       // input regs
+                    "4" (dstptr),      // edi
+                    "0" (diff),        // eax
+// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
+                    "2" (len),         // ecx
+                    "1" (mask)         // edx
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
+                  , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+               );
+            }
+            else /* mmx _not supported - Use modified C routine */
+#endif /* PNG_MMX_CODE_SUPPORTED */
+            {
+               register png_uint_32 i;
+               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
+                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
+               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
+                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
+               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
+                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
+               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
+               int diff = (int) (png_ptr->width & 7); /* amount lost */
+               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
+
+               srcptr = png_ptr->row_buf + 1 + initial_val;
+               dstptr = row + initial_val;
+
+               for (i = initial_val; i < final_val; i += stride)
+               {
+                  png_memcpy(dstptr, srcptr, rep_bytes);
+                  srcptr += stride;
+                  dstptr += stride;
+               }
+               if (diff)  /* number of leftover pixels:  3 for pngtest */
+               {
+                  final_val+=diff*BPP4;
+                  for (; i < final_val; i += stride)
+                  {
+                     if (rep_bytes > (int)(final_val-i))
+                        rep_bytes = (int)(final_val-i);
+                     png_memcpy(dstptr, srcptr, rep_bytes);
+                     srcptr += stride;
+                     dstptr += stride;
+                  }
+               }
+            } /* end of else (_mmx_supported) */
+
+            break;
+         }       /* end 32 bpp */
+
+         case 48:       /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
+                /* && _mmx_supported */ )
+#else
+            if (_mmx_supported)
+#endif
+            {
+               png_uint_32 len;
+               int diff;
+               int dummy_value_a;   // fix 'forbidden register spilled' error
+               int dummy_value_d;
+               int dummy_value_c;
+               int dummy_value_S;
+               int dummy_value_D;
+               _unmask = ~mask;            // global variable for -fPIC version
+               srcptr = png_ptr->row_buf + 1;
+               dstptr = row;
+               len  = png_ptr->width &~7;  // reduce to multiple of 8
+               diff = (int) (png_ptr->width & 7); // amount lost //
+
+               __asm__ __volatile__ (
+                  "movd      _unmask, %%mm7   \n\t" // load bit pattern
+                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
+                  "punpcklbw %%mm7, %%mm7     \n\t"
+                  "punpcklwd %%mm7, %%mm7     \n\t"
+                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
+
+                  "movq      _mask48_0, %%mm0 \n\t"
+                  "movq      _mask48_1, %%mm1 \n\t"
+                  "movq      _mask48_2, %%mm2 \n\t"
+                  "movq      _mask48_3, %%mm3 \n\t"
+                  "movq      _mask48_4, %%mm4 \n\t"
+                  "movq      _mask48_5, %%mm5 \n\t"
+
+                  "pand      %%mm7, %%mm0     \n\t"
+                  "pand      %%mm7, %%mm1     \n\t"
+                  "pand      %%mm7, %%mm2     \n\t"
+                  "pand      %%mm7, %%mm3     \n\t"
+                  "pand      %%mm7, %%mm4     \n\t"
+                  "pand      %%mm7, %%mm5     \n\t"
+
+                  "pcmpeqb   %%mm6, %%mm0     \n\t"
+                  "pcmpeqb   %%mm6, %%mm1     \n\t"
+                  "pcmpeqb   %%mm6, %%mm2     \n\t"
+                  "pcmpeqb   %%mm6, %%mm3     \n\t"
+                  "pcmpeqb   %%mm6, %%mm4     \n\t"
+                  "pcmpeqb   %%mm6, %%mm5     \n\t"
+
+// preload        "movl      len, %%ecx       \n\t" // load length of line
+// preload        "movl      srcptr, %%esi    \n\t" // load source
+// preload        "movl      dstptr, %%edi    \n\t" // load dest
+
+                  "cmpl      $0, %%ecx        \n\t"
+                  "jz        mainloop48end    \n\t"
+
+                "mainloop48:                  \n\t"
+                  "movq      (%%esi), %%mm7   \n\t"
+                  "pand      %%mm0, %%mm7     \n\t"
+                  "movq      %%mm0, %%mm6     \n\t"
+                  "pandn     (%%edi), %%mm6   \n\t"
+                  "por       %%mm6, %%mm7     \n\t"
+                  "movq      %%mm7, (%%edi)   \n\t"
+
+                  "movq      8(%%esi), %%mm6  \n\t"
+                  "pand      %%mm1, %%mm6     \n\t"
+                  "movq      %%mm1, %%mm7     \n\t"
+                  "pandn     8(%%edi), %%mm7  \n\t"
+                  "por       %%mm7, %%mm6     \n\t"
+                  "movq      %%mm6, 8(%%edi)  \n\t"
+
+                  "movq      16(%%esi), %%mm6 \n\t"
+                  "pand      %%mm2, %%mm6     \n\t"
+                  "movq      %%mm2, %%mm7     \n\t"
+                  "pandn     16(%%edi), %%mm7 \n\t"
+                  "por       %%mm7, %%mm6     \n\t"
+                  "movq      %%mm6, 16(%%edi) \n\t"
+
+                  "movq      24(%%esi), %%mm7 \n\t"
+                  "pand      %%mm3, %%mm7     \n\t"
+                  "movq      %%mm3, %%mm6     \n\t"
+                  "pandn     24(%%edi), %%mm6 \n\t"
+                  "por       %%mm6, %%mm7     \n\t"
+                  "movq      %%mm7, 24(%%edi) \n\t"
+
+                  "movq      32(%%esi), %%mm6 \n\t"
+                  "pand      %%mm4, %%mm6     \n\t"
+                  "movq      %%mm4, %%mm7     \n\t"
+                  "pandn     32(%%edi), %%mm7 \n\t"
+                  "por       %%mm7, %%mm6     \n\t"
+                  "movq      %%mm6, 32(%%edi) \n\t"
+
+                  "movq      40(%%esi), %%mm7 \n\t"
+                  "pand      %%mm5, %%mm7     \n\t"
+                  "movq      %%mm5, %%mm6     \n\t"
+                  "pandn     40(%%edi), %%mm6 \n\t"
+                  "por       %%mm6, %%mm7     \n\t"
+                  "movq      %%mm7, 40(%%edi) \n\t"
+
+                  "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
+                  "addl      $48, %%edi       \n\t"
+                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
+
+                  "ja        mainloop48       \n\t"
+
+                "mainloop48end:               \n\t"
+// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
+                  "movl      %%eax, %%ecx     \n\t"
+                  "cmpl      $0, %%ecx        \n\t"
+                  "jz        end48            \n\t"
+// preload        "movl      mask, %%edx      \n\t"
+                  "sall      $24, %%edx       \n\t" // make low byte, high byte
+
+                "secondloop48:                \n\t"
+                  "sall      %%edx            \n\t" // move high bit to CF
+                  "jnc       skip48           \n\t" // if CF = 0
+                  "movl      (%%esi), %%eax   \n\t"
+                  "movl      %%eax, (%%edi)   \n\t"
+
+                "skip48:                      \n\t"
+                  "addl      $4, %%esi        \n\t"
+                  "addl      $4, %%edi        \n\t"
+                  "decl      %%ecx            \n\t"
+                  "jnz       secondloop48     \n\t"
+
+                "end48:                       \n\t"
+                  "EMMS                       \n\t" // DONE
+
+                  : "=a" (dummy_value_a),           // output regs (dummy)
+                    "=d" (dummy_value_d),
+                    "=c" (dummy_value_c),
+                    "=S" (dummy_value_S),
+                    "=D" (dummy_value_D)
+
+                  : "3" (srcptr),      // esi       // input regs
+                    "4" (dstptr),      // edi
+                    "0" (diff),        // eax
+// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
+                    "2" (len),         // ecx
+                    "1" (mask)         // edx
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
+                  , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+               );
+            }
+            else /* mmx _not supported - Use modified C routine */
+#endif /* PNG_MMX_CODE_SUPPORTED */
+            {
+               register png_uint_32 i;
+               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
+                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
+               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
+                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
+               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
+                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
+               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
+               int diff = (int) (png_ptr->width & 7); /* amount lost */
+               register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
+
+               srcptr = png_ptr->row_buf + 1 + initial_val;
+               dstptr = row + initial_val;
+
+               for (i = initial_val; i < final_val; i += stride)
+               {
+                  png_memcpy(dstptr, srcptr, rep_bytes);
+                  srcptr += stride;
+                  dstptr += stride;
+               }
+               if (diff)  /* number of leftover pixels:  3 for pngtest */
+               {
+                  final_val+=diff*BPP6;
+                  for (; i < final_val; i += stride)
+                  {
+                     if (rep_bytes > (int)(final_val-i))
+                        rep_bytes = (int)(final_val-i);
+                     png_memcpy(dstptr, srcptr, rep_bytes);
+                     srcptr += stride;
+                     dstptr += stride;
+                  }
+               }
+            } /* end of else (_mmx_supported) */
+
+            break;
+         }       /* end 48 bpp */
+
+         case 64:       /* png_ptr->row_info.pixel_depth */
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+            register png_uint_32 i;
+            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
+              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
+            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
+              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
+            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
+              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
+            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
+            int diff = (int) (png_ptr->width & 7); /* amount lost */
+            register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
+
+            srcptr = png_ptr->row_buf + 1 + initial_val;
+            dstptr = row + initial_val;
+
+            for (i = initial_val; i < final_val; i += stride)
+            {
+               png_memcpy(dstptr, srcptr, rep_bytes);
+               srcptr += stride;
+               dstptr += stride;
+            }
+            if (diff)  /* number of leftover pixels:  3 for pngtest */
+            {
+               final_val+=diff*BPP8;
+               for (; i < final_val; i += stride)
+               {
+                  if (rep_bytes > (int)(final_val-i))
+                     rep_bytes = (int)(final_val-i);
+                  png_memcpy(dstptr, srcptr, rep_bytes);
+                  srcptr += stride;
+                  dstptr += stride;
+               }
+            }
+
+            break;
+         }       /* end 64 bpp */
+
+         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
+         {
+            /* this should never happen */
+            png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
+            break;
+         }
+      } /* end switch (png_ptr->row_info.pixel_depth) */
+
+   } /* end if (non-trivial mask) */
+
+} /* end png_combine_row() */
+
+#endif /* PNG_HAVE_MMX_COMBINE_ROW */
+
+
+
+
+/*===========================================================================*/
+/*                                                                           */
+/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
+/*                                                                           */
+/*===========================================================================*/
+
+#if defined(PNG_READ_INTERLACING_SUPPORTED)
+#if defined(PNG_HAVE_MMX_READ_INTERLACE)
+
+/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
+ * has taken place.  [GRR: what other steps come before and/or after?]
+ */
+
+void /* PRIVATE */
+png_do_read_interlace(png_structp png_ptr)
+{
+   png_row_infop row_info = &(png_ptr->row_info);
+   png_bytep row = png_ptr->row_buf + 1;
+   int pass = png_ptr->pass;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+   png_uint_32 transformations = png_ptr->transformations;
+#endif
+
+   png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
+
+#if defined(PNG_MMX_CODE_SUPPORTED)
+   if (_mmx_supported == 2) {
+#if !defined(PNG_1_0_X)
+       /* this should have happened in png_init_mmx_flags() already */
+       png_warning(png_ptr, "asm_flags may not have been initialized");
+#endif
+       png_mmx_support();
+   }
+#endif
+
+   if (row != NULL && row_info != NULL)
+   {
+      png_uint_32 final_width;
+
+      final_width = row_info->width * png_pass_inc[pass];
+
+      switch (row_info->pixel_depth)
+      {
+         case 1:
+         {
+            png_bytep sp, dp;
+            int sshift, dshift;
+            int s_start, s_end, s_inc;
+            png_byte v;
+            png_uint_32 i;
+            int j;
+
+            sp = row + (png_size_t)((row_info->width - 1) >> 3);
+            dp = row + (png_size_t)((final_width - 1) >> 3);
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+            if (transformations & PNG_PACKSWAP)
+            {
+               sshift = (int)((row_info->width + 7) & 7);
+               dshift = (int)((final_width + 7) & 7);
+               s_start = 7;
+               s_end = 0;
+               s_inc = -1;
+            }
+            else
+#endif
+            {
+               sshift = 7 - (int)((row_info->width + 7) & 7);
+               dshift = 7 - (int)((final_width + 7) & 7);
+               s_start = 0;
+               s_end = 7;
+               s_inc = 1;
+            }
+
+            for (i = row_info->width; i; i--)
+            {
+               v = (png_byte)((*sp >> sshift) & 0x1);
+               for (j = 0; j < png_pass_inc[pass]; j++)
+               {
+                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
+                  *dp |= (png_byte)(v << dshift);
+                  if (dshift == s_end)
+                  {
+                     dshift = s_start;
+                     dp--;
+                  }
+                  else
+                     dshift += s_inc;
+               }
+               if (sshift == s_end)
+               {
+                  sshift = s_start;
+                  sp--;
+               }
+               else
+                  sshift += s_inc;
+            }
+            break;
+         }
+
+         case 2:
+         {
+            png_bytep sp, dp;
+            int sshift, dshift;
+            int s_start, s_end, s_inc;
+            png_uint_32 i;
+
+            sp = row + (png_size_t)((row_info->width - 1) >> 2);
+            dp = row + (png_size_t)((final_width - 1) >> 2);
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+            if (transformations & PNG_PACKSWAP)
+            {
+               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
+               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
+               s_start = 6;
+               s_end = 0;
+               s_inc = -2;
+            }
+            else
+#endif
+            {
+               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
+               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
+               s_start = 0;
+               s_end = 6;
+               s_inc = 2;
+            }
+
+            for (i = row_info->width; i; i--)
+            {
+               png_byte v;
+               int j;
+
+               v = (png_byte)((*sp >> sshift) & 0x3);
+               for (j = 0; j < png_pass_inc[pass]; j++)
+               {
+                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
+                  *dp |= (png_byte)(v << dshift);
+                  if (dshift == s_end)
+                  {
+                     dshift = s_start;
+                     dp--;
+                  }
+                  else
+                     dshift += s_inc;
+               }
+               if (sshift == s_end)
+               {
+                  sshift = s_start;
+                  sp--;
+               }
+               else
+                  sshift += s_inc;
+            }
+            break;
+         }
+
+         case 4:
+         {
+            png_bytep sp, dp;
+            int sshift, dshift;
+            int s_start, s_end, s_inc;
+            png_uint_32 i;
+
+            sp = row + (png_size_t)((row_info->width - 1) >> 1);
+            dp = row + (png_size_t)((final_width - 1) >> 1);
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+            if (transformations & PNG_PACKSWAP)
+            {
+               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
+               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
+               s_start = 4;
+               s_end = 0;
+               s_inc = -4;
+            }
+            else
+#endif
+            {
+               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
+               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
+               s_start = 0;
+               s_end = 4;
+               s_inc = 4;
+            }
+
+            for (i = row_info->width; i; i--)
+            {
+               png_byte v;
+               int j;
+
+               v = (png_byte)((*sp >> sshift) & 0xf);
+               for (j = 0; j < png_pass_inc[pass]; j++)
+               {
+                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
+                  *dp |= (png_byte)(v << dshift);
+                  if (dshift == s_end)
+                  {
+                     dshift = s_start;
+                     dp--;
+                  }
+                  else
+                     dshift += s_inc;
+               }
+               if (sshift == s_end)
+               {
+                  sshift = s_start;
+                  sp--;
+               }
+               else
+                  sshift += s_inc;
+            }
+            break;
+         }
+
+       /*====================================================================*/
+
+         default: /* 8-bit or larger (this is where the routine is modified) */
+         {
+#if 0
+//          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
+//          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
+//          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
+//          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
+#endif
+            png_bytep sptr, dp;
+            png_uint_32 i;
+            png_size_t pixel_bytes;
+            int width = (int)row_info->width;
+
+            pixel_bytes = (row_info->pixel_depth >> 3);
+
+            /* point sptr at the last pixel in the pre-expanded row: */
+            sptr = row + (width - 1) * pixel_bytes;
+
+            /* point dp at the last pixel position in the expanded row: */
+            dp = row + (final_width - 1) * pixel_bytes;
+
+            /* New code by Nirav Chhatrapati - Intel Corporation */
+
+#if defined(PNG_MMX_CODE_SUPPORTED)
+#if !defined(PNG_1_0_X)
+            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
+                /* && _mmx_supported */ )
+#else
+            if (_mmx_supported)
+#endif
+            {
+               //--------------------------------------------------------------
+               if (pixel_bytes == 3)
+               {
+                  if (((pass == 0) || (pass == 1)) && width)
+                  {
+                     int dummy_value_c;   // fix 'forbidden register spilled'
+                     int dummy_value_S;
+                     int dummy_value_D;
+                     int dummy_value_a;
+
+                     __asm__ __volatile__ (
+                        "subl $21, %%edi         \n\t"
+                                     // (png_pass_inc[pass] - 1)*pixel_bytes
+
+                     ".loop3_pass0:              \n\t"
+                        "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
+                        "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
+                        "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
+                        "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
+                        "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
+                        "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
+                        "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
+                        "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
+                        "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
+                        "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
+                        "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
+                        "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
+                        "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
+                        "movq %%mm4, 16(%%edi)   \n\t"
+                        "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
+                        "movq %%mm3, 8(%%edi)    \n\t"
+                        "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
+                        "subl $3, %%esi          \n\t"
+                        "movq %%mm0, (%%edi)     \n\t"
+                        "subl $24, %%edi         \n\t"
+                        "decl %%ecx              \n\t"
+                        "jnz .loop3_pass0        \n\t"
+                        "EMMS                    \n\t" // DONE
+
+                        : "=c" (dummy_value_c),        // output regs (dummy)
+                          "=S" (dummy_value_S),
+                          "=D" (dummy_value_D),
+                          "=a" (dummy_value_a)
+
+
+                        : "1" (sptr),      // esi      // input regs
+                          "2" (dp),        // edi
+                          "0" (width),     // ecx
+                          "3" (&_const4)  // %1(?)  (0x0000000000FFFFFFLL)
+
+#if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                        : "%mm0", "%mm1", "%mm2"       // clobber list
+                        , "%mm3", "%mm4"
+#endif
+                     );
+                  }
+                  else if (((pass == 2) || (pass == 3)) && width)
+                  {
+                     int dummy_value_c;   // fix 'forbidden register spilled'
+                     int dummy_value_S;
+                     int dummy_value_D;
+                     int dummy_value_a;
+
+                     __asm__ __volatile__ (
+                        "subl $9, %%edi          \n\t"
+                                     // (png_pass_inc[pass] - 1)*pixel_bytes
+
+                     ".loop3_pass2:              \n\t"
+                        "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
+                        "pand (%3), %%mm0     \n\t" // z z z z z 2 1 0
+                        "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
+                        "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
+                        "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
+                        "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
+                        "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
+                        "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
+                        "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
+                        "movq %%mm0, 4(%%edi)    \n\t"
+                        "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
+                        "subl $3, %%esi          \n\t"
+                        "movd %%mm0, (%%edi)     \n\t"
+                        "subl $12, %%edi         \n\t"
+                        "decl %%ecx              \n\t"
+                        "jnz .loop3_pass2        \n\t"
+                        "EMMS                    \n\t" // DONE
+
+                        : "=c" (dummy_value_c),        // output regs (dummy)
+                          "=S" (dummy_value_S),
+                          "=D" (dummy_value_D),
+                          "=a" (dummy_value_a)
+
+                        : "1" (sptr),      // esi      // input regs
+                          "2" (dp),        // edi
+                          "0" (width),     // ecx
+                          "3" (&_const4)  // (0x0000000000FFFFFFLL)
+
+#if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                        : "%mm0", "%mm1", "%mm2"       // clobber list
+#endif
+                     );
+                  }
+                  else if (width) /* && ((pass == 4) || (pass == 5)) */
+                  {
+                     int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
+                     if (width_mmx < 0)
+                         width_mmx = 0;
+                     width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
+                     if (width_mmx)
+                     {
+                        // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
+                        // sptr points at last pixel in pre-expanded row
+                        // dp points at last pixel position in expanded row
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+                        int dummy_value_a;
+                        int dummy_value_d;
+
+                        __asm__ __volatile__ (
+                           "subl $3, %%esi          \n\t"
+                           "subl $9, %%edi          \n\t"
+                                        // (png_pass_inc[pass] + 1)*pixel_bytes
+
+                        ".loop3_pass4:              \n\t"
+                           "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
+                           "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
+                           "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
+                           "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
+                           "pand (%3), %%mm1          \n\t" // z z z z z 2 1 0
+                           "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
+                           "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
+                           "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
+                           "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
+                           "pand (%4), %%mm3     \n\t" // z z z z z z z 5
+                           "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
+                           "subl $6, %%esi          \n\t"
+                           "movd %%mm2, 8(%%edi)    \n\t"
+                           "subl $12, %%edi         \n\t"
+                           "subl $2, %%ecx          \n\t"
+                           "jnz .loop3_pass4        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D),
+                             "=a" (dummy_value_a),
+                             "=d" (dummy_value_d)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx), // ecx
+                             "3" (&_const4), // 0x0000000000FFFFFFLL
+                             "4" (&_const6)  // 0x00000000000000FFLL
+
+#if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1"               // clobber list
+                           , "%mm2", "%mm3"
+#endif
+                        );
+                     }
+
+                     sptr -= width_mmx*3;
+                     dp -= width_mmx*6;
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+
+                        png_memcpy(v, sptr, 3);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           png_memcpy(dp, v, 3);
+                           dp -= 3;
+                        }
+                        sptr -= 3;
+                     }
+                  }
+               } /* end of pixel_bytes == 3 */
+
+               //--------------------------------------------------------------
+               else if (pixel_bytes == 1)
+               {
+                  if (((pass == 0) || (pass == 1)) && width)
+                  {
+                     int width_mmx = ((width >> 2) << 2);
+                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $3, %%esi          \n\t"
+                           "subl $31, %%edi         \n\t"
+
+                        ".loop1_pass0:              \n\t"
+                           "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
+                           "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
+                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
+                           "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
+                           "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
+                           "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
+                           "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
+                           "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
+                           "movq %%mm3, 8(%%edi)    \n\t"
+                           "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
+                           "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
+                           "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
+                           "movq %%mm2, 16(%%edi)   \n\t"
+                           "subl $4, %%esi          \n\t"
+                           "movq %%mm4, 24(%%edi)   \n\t"
+                           "subl $32, %%edi         \n\t"
+                           "subl $4, %%ecx          \n\t"
+                           "jnz .loop1_pass0        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1", "%mm2"       // clobber list
+                           , "%mm3", "%mm4"
+#endif
+                        );
+                     }
+
+                     sptr -= width_mmx;
+                     dp -= width_mmx*8;
+                     for (i = width; i; i--)
+                     {
+                        int j;
+
+                       /* I simplified this part in version 1.0.4e
+                        * here and in several other instances where
+                        * pixel_bytes == 1  -- GR-P
+                        *
+                        * Original code:
+                        *
+                        * png_byte v[8];
+                        * png_memcpy(v, sptr, pixel_bytes);
+                        * for (j = 0; j < png_pass_inc[pass]; j++)
+                        * {
+                        *    png_memcpy(dp, v, pixel_bytes);
+                        *    dp -= pixel_bytes;
+                        * }
+                        * sptr -= pixel_bytes;
+                        *
+                        * Replacement code is in the next three lines:
+                        */
+
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           *dp-- = *sptr;
+                        }
+                        --sptr;
+                     }
+                  }
+                  else if (((pass == 2) || (pass == 3)) && width)
+                  {
+                     int width_mmx = ((width >> 2) << 2);
+                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $3, %%esi          \n\t"
+                           "subl $15, %%edi         \n\t"
+
+                        ".loop1_pass2:              \n\t"
+                           "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
+                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
+                           "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
+                           "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
+                           "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "subl $4, %%esi          \n\t"
+                           "movq %%mm1, 8(%%edi)    \n\t"
+                           "subl $16, %%edi         \n\t"
+                           "subl $4, %%ecx          \n\t"
+                           "jnz .loop1_pass2        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1"               // clobber list
+#endif
+                        );
+                     }
+
+                     sptr -= width_mmx;
+                     dp -= width_mmx*4;
+                     for (i = width; i; i--)
+                     {
+                        int j;
+
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           *dp-- = *sptr;
+                        }
+                        --sptr;
+                     }
+                  }
+                  else if (width)  /* && ((pass == 4) || (pass == 5)) */
+                  {
+                     int width_mmx = ((width >> 3) << 3);
+                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $7, %%esi          \n\t"
+                           "subl $15, %%edi         \n\t"
+
+                        ".loop1_pass4:              \n\t"
+                           "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
+                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
+                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
+                           "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
+                           "movq %%mm1, 8(%%edi)    \n\t"
+                           "subl $8, %%esi          \n\t"
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "subl $16, %%edi         \n\t"
+                           "subl $8, %%ecx          \n\t"
+                           "jnz .loop1_pass4        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (none)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1"               // clobber list
+#endif
+                        );
+                     }
+
+                     sptr -= width_mmx;
+                     dp -= width_mmx*2;
+                     for (i = width; i; i--)
+                     {
+                        int j;
+
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           *dp-- = *sptr;
+                        }
+                        --sptr;
+                     }
+                  }
+               } /* end of pixel_bytes == 1 */
+
+               //--------------------------------------------------------------
+               else if (pixel_bytes == 2)
+               {
+                  if (((pass == 0) || (pass == 1)) && width)
+                  {
+                     int width_mmx = ((width >> 1) << 1);
+                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $2, %%esi          \n\t"
+                           "subl $30, %%edi         \n\t"
+
+                        ".loop2_pass0:              \n\t"
+                           "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
+                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
+                           "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
+                           "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
+                           "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "movq %%mm0, 8(%%edi)    \n\t"
+                           "movq %%mm1, 16(%%edi)   \n\t"
+                           "subl $4, %%esi          \n\t"
+                           "movq %%mm1, 24(%%edi)   \n\t"
+                           "subl $32, %%edi         \n\t"
+                           "subl $2, %%ecx          \n\t"
+                           "jnz .loop2_pass0        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1"               // clobber list
+#endif
+                        );
+                     }
+
+                     sptr -= (width_mmx*2 - 2); // sign fixed
+                     dp -= (width_mmx*16 - 2);  // sign fixed
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= 2;
+                        png_memcpy(v, sptr, 2);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= 2;
+                           png_memcpy(dp, v, 2);
+                        }
+                     }
+                  }
+                  else if (((pass == 2) || (pass == 3)) && width)
+                  {
+                     int width_mmx = ((width >> 1) << 1) ;
+                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $2, %%esi          \n\t"
+                           "subl $14, %%edi         \n\t"
+
+                        ".loop2_pass2:              \n\t"
+                           "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
+                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
+                           "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
+                           "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
+                           "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "subl $4, %%esi          \n\t"
+                           "movq %%mm1, 8(%%edi)    \n\t"
+                           "subl $16, %%edi         \n\t"
+                           "subl $2, %%ecx          \n\t"
+                           "jnz .loop2_pass2        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1"               // clobber list
+#endif
+                        );
+                     }
+
+                     sptr -= (width_mmx*2 - 2); // sign fixed
+                     dp -= (width_mmx*8 - 2);   // sign fixed
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= 2;
+                        png_memcpy(v, sptr, 2);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= 2;
+                           png_memcpy(dp, v, 2);
+                        }
+                     }
+                  }
+                  else if (width)  // pass == 4 or 5
+                  {
+                     int width_mmx = ((width >> 1) << 1) ;
+                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $2, %%esi          \n\t"
+                           "subl $6, %%edi          \n\t"
+
+                        ".loop2_pass4:              \n\t"
+                           "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
+                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
+                           "subl $4, %%esi          \n\t"
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "subl $8, %%edi          \n\t"
+                           "subl $2, %%ecx          \n\t"
+                           "jnz .loop2_pass4        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0"                       // clobber list
+#endif
+                        );
+                     }
+
+                     sptr -= (width_mmx*2 - 2); // sign fixed
+                     dp -= (width_mmx*4 - 2);   // sign fixed
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= 2;
+                        png_memcpy(v, sptr, 2);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= 2;
+                           png_memcpy(dp, v, 2);
+                        }
+                     }
+                  }
+               } /* end of pixel_bytes == 2 */
+
+               //--------------------------------------------------------------
+               else if (pixel_bytes == 4)
+               {
+                  if (((pass == 0) || (pass == 1)) && width)
+                  {
+                     int width_mmx = ((width >> 1) << 1);
+                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $4, %%esi          \n\t"
+                           "subl $60, %%edi         \n\t"
+
+                        ".loop4_pass0:              \n\t"
+                           "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
+                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
+                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
+                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "movq %%mm0, 8(%%edi)    \n\t"
+                           "movq %%mm0, 16(%%edi)   \n\t"
+                           "movq %%mm0, 24(%%edi)   \n\t"
+                           "movq %%mm1, 32(%%edi)   \n\t"
+                           "movq %%mm1, 40(%%edi)   \n\t"
+                           "movq %%mm1, 48(%%edi)   \n\t"
+                           "subl $8, %%esi          \n\t"
+                           "movq %%mm1, 56(%%edi)   \n\t"
+                           "subl $64, %%edi         \n\t"
+                           "subl $2, %%ecx          \n\t"
+                           "jnz .loop4_pass0        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1"               // clobber list
+#endif
+                        );
+                     }
+
+                     sptr -= (width_mmx*4 - 4); // sign fixed
+                     dp -= (width_mmx*32 - 4);  // sign fixed
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= 4;
+                        png_memcpy(v, sptr, 4);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= 4;
+                           png_memcpy(dp, v, 4);
+                        }
+                     }
+                  }
+                  else if (((pass == 2) || (pass == 3)) && width)
+                  {
+                     int width_mmx = ((width >> 1) << 1);
+                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $4, %%esi          \n\t"
+                           "subl $28, %%edi         \n\t"
+
+                        ".loop4_pass2:              \n\t"
+                           "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
+                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
+                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
+                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "movq %%mm0, 8(%%edi)    \n\t"
+                           "movq %%mm1, 16(%%edi)   \n\t"
+                           "movq %%mm1, 24(%%edi)   \n\t"
+                           "subl $8, %%esi          \n\t"
+                           "subl $32, %%edi         \n\t"
+                           "subl $2, %%ecx          \n\t"
+                           "jnz .loop4_pass2        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1"               // clobber list
+#endif
+                        );
+                     }
+
+                     sptr -= (width_mmx*4 - 4); // sign fixed
+                     dp -= (width_mmx*16 - 4);  // sign fixed
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= 4;
+                        png_memcpy(v, sptr, 4);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= 4;
+                           png_memcpy(dp, v, 4);
+                        }
+                     }
+                  }
+                  else if (width)  // pass == 4 or 5
+                  {
+                     int width_mmx = ((width >> 1) << 1) ;
+                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
+                     if (width_mmx)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $4, %%esi          \n\t"
+                           "subl $12, %%edi         \n\t"
+
+                        ".loop4_pass4:              \n\t"
+                           "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
+                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
+                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
+                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "subl $8, %%esi          \n\t"
+                           "movq %%mm1, 8(%%edi)    \n\t"
+                           "subl $16, %%edi         \n\t"
+                           "subl $2, %%ecx          \n\t"
+                           "jnz .loop4_pass4        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width_mmx)  // ecx
+
+#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0", "%mm1"               // clobber list
+#endif
+                        );
+                     }
+
+                     sptr -= (width_mmx*4 - 4); // sign fixed
+                     dp -= (width_mmx*8 - 4);   // sign fixed
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= 4;
+                        png_memcpy(v, sptr, 4);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= 4;
+                           png_memcpy(dp, v, 4);
+                        }
+                     }
+                  }
+               } /* end of pixel_bytes == 4 */
+
+               //--------------------------------------------------------------
+               else if (pixel_bytes == 8)
+               {
+// GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
+                  // GRR NOTE:  no need to combine passes here!
+                  if (((pass == 0) || (pass == 1)) && width)
+                  {
+                     int dummy_value_c;  // fix 'forbidden register spilled'
+                     int dummy_value_S;
+                     int dummy_value_D;
+
+                     // source is 8-byte RRGGBBAA
+                     // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
+                     __asm__ __volatile__ (
+                        "subl $56, %%edi         \n\t" // start of last block
+
+                     ".loop8_pass0:              \n\t"
+                        "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
+                        "movq %%mm0, (%%edi)     \n\t"
+                        "movq %%mm0, 8(%%edi)    \n\t"
+                        "movq %%mm0, 16(%%edi)   \n\t"
+                        "movq %%mm0, 24(%%edi)   \n\t"
+                        "movq %%mm0, 32(%%edi)   \n\t"
+                        "movq %%mm0, 40(%%edi)   \n\t"
+                        "movq %%mm0, 48(%%edi)   \n\t"
+                        "subl $8, %%esi          \n\t"
+                        "movq %%mm0, 56(%%edi)   \n\t"
+                        "subl $64, %%edi         \n\t"
+                        "decl %%ecx              \n\t"
+                        "jnz .loop8_pass0        \n\t"
+                        "EMMS                    \n\t" // DONE
+
+                        : "=c" (dummy_value_c),        // output regs (dummy)
+                          "=S" (dummy_value_S),
+                          "=D" (dummy_value_D)
+
+                        : "1" (sptr),      // esi      // input regs
+                          "2" (dp),        // edi
+                          "0" (width)      // ecx
+
+#if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                        : "%mm0"                       // clobber list
+#endif
+                     );
+                  }
+                  else if (((pass == 2) || (pass == 3)) && width)
+                  {
+                     // source is 8-byte RRGGBBAA
+                     // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
+                     // (recall that expansion is _in place_:  sptr and dp
+                     //  both point at locations within same row buffer)
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $24, %%edi         \n\t" // start of last block
+
+                        ".loop8_pass2:              \n\t"
+                           "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "movq %%mm0, 8(%%edi)    \n\t"
+                           "movq %%mm0, 16(%%edi)   \n\t"
+                           "subl $8, %%esi          \n\t"
+                           "movq %%mm0, 24(%%edi)   \n\t"
+                           "subl $32, %%edi         \n\t"
+                           "decl %%ecx              \n\t"
+                           "jnz .loop8_pass2        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width)      // ecx
+
+#if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0"                       // clobber list
+#endif
+                        );
+                     }
+                  }
+                  else if (width)  // pass == 4 or 5
+                  {
+                     // source is 8-byte RRGGBBAA
+                     // dest is 16-byte RRGGBBAA RRGGBBAA
+                     {
+                        int dummy_value_c;  // fix 'forbidden register spilled'
+                        int dummy_value_S;
+                        int dummy_value_D;
+
+                        __asm__ __volatile__ (
+                           "subl $8, %%edi          \n\t" // start of last block
+
+                        ".loop8_pass4:              \n\t"
+                           "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
+                           "movq %%mm0, (%%edi)     \n\t"
+                           "subl $8, %%esi          \n\t"
+                           "movq %%mm0, 8(%%edi)    \n\t"
+                           "subl $16, %%edi         \n\t"
+                           "decl %%ecx              \n\t"
+                           "jnz .loop8_pass4        \n\t"
+                           "EMMS                    \n\t" // DONE
+
+                           : "=c" (dummy_value_c),        // output regs (dummy)
+                             "=S" (dummy_value_S),
+                             "=D" (dummy_value_D)
+
+                           : "1" (sptr),      // esi      // input regs
+                             "2" (dp),        // edi
+                             "0" (width)      // ecx
+
+#if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
+                           : "%mm0"                       // clobber list
+#endif
+                        );
+                     }
+                  }
+
+               } /* end of pixel_bytes == 8 */
+
+               //--------------------------------------------------------------
+               else if (pixel_bytes == 6)
+               {
+                  for (i = width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, 6);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, 6);
+                        dp -= 6;
+                     }
+                     sptr -= 6;
+                  }
+               } /* end of pixel_bytes == 6 */
+
+               //--------------------------------------------------------------
+               else
+               {
+                  for (i = width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr-= pixel_bytes;
+                  }
+               }
+            } // end of _mmx_supported ========================================
+
+            else /* MMX not supported:  use modified C code - takes advantage
+                  *   of inlining of png_memcpy for a constant */
+                 /* GRR 19991007:  does it?  or should pixel_bytes in each
+                  *   block be replaced with immediate value (e.g., 1)? */
+                 /* GRR 19991017:  replaced with constants in each case */
+#endif /* PNG_MMX_CODE_SUPPORTED */
+            {
+               if (pixel_bytes == 1)
+               {
+                  for (i = width; i; i--)
+                  {
+                     int j;
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        *dp-- = *sptr;
+                     }
+                     --sptr;
+                  }
+               }
+               else if (pixel_bytes == 3)
+               {
+                  for (i = width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, 3);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, 3);
+                        dp -= 3;
+                     }
+                     sptr -= 3;
+                  }
+               }
+               else if (pixel_bytes == 2)
+               {
+                  for (i = width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, 2);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, 2);
+                        dp -= 2;
+                     }
+                     sptr -= 2;
+                  }
+               }
+               else if (pixel_bytes == 4)
+               {
+                  for (i = width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, 4);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+#ifdef PNG_DEBUG
+                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
+                        {
+                           printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
+                             row, dp, row+png_ptr->row_buf_size);
+                           printf("row_buf=%d\n",png_ptr->row_buf_size);
+                        }
+#endif
+                        png_memcpy(dp, v, 4);
+                        dp -= 4;
+                     }
+                     sptr -= 4;
+                  }
+               }
+               else if (pixel_bytes == 6)
+               {
+                  for (i = width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, 6);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, 6);
+                        dp -= 6;
+                     }
+                     sptr -= 6;
+                  }
+               }
+               else if (pixel_bytes == 8)
+               {
+                  for (i = width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, 8);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, 8);
+                        dp -= 8;
+                     }
+                     sptr -= 8;
+                  }
+               }
+               else     /* GRR:  should never be reached */
+               {
+                  for (i = width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr -= pixel_bytes;
+                  }
+               }
+
+            } /* end if (MMX not supported) */
+            break;
+         }
+      } /* end switch (row_info->pixel_depth) */
+
+      row_info->width = final_width;
+
+      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
+   }
+
+} /* end png_do_read_interlace() */
+
+#endif /* PNG_HAVE_MMX_READ_INTERLACE */
+#endif /* PNG_READ_INTERLACING_SUPPORTED */
+
+
+
+#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
+#if defined(PNG_MMX_CODE_SUPPORTED)
+
+// These variables are utilized in the functions below.  They are declared
+// globally here to ensure alignment on 8-byte boundaries.
+
+union uAll {
+   long long use;
+   double  align;
+} _LBCarryMask = {0x0101010101010101LL},
+  _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
+  _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
+
+#ifdef PNG_THREAD_UNSAFE_OK
+//===========================================================================//
+//                                                                           //
+//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
+//                                                                           //
+//===========================================================================//
+
+// Optimized code for PNG Average filter decoder
+
+static void /* PRIVATE */
+png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
+                            png_bytep prev_row)
+{
+   int bpp;
+   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
+   int dummy_value_S;
+   int dummy_value_D;
+
+   bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
+   _FullLength  = row_info->rowbytes;       // # of bytes to filter
+
+   __asm__ __volatile__ (
+      // initialize address pointers and offset
+#ifdef __PIC__
+      "pushl %%ebx                 \n\t" // save index to Global Offset Table
+#endif
+//pre "movl row, %%edi             \n\t" // edi:  Avg(x)
+      "xorl %%ebx, %%ebx           \n\t" // ebx:  x
+      "movl %%edi, %%edx           \n\t"
+//pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+//pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+      "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
+
+      "xorl %%eax,%%eax            \n\t"
+
+      // Compute the Raw value for the first bpp bytes
+      //    Raw(x) = Avg(x) + (Prior(x)/2)
+   "avg_rlp:                       \n\t"
+      "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
+      "incl %%ebx                  \n\t"
+      "shrb %%al                   \n\t" // divide by 2
+      "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
+//pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
+      "cmpl %%ecx, %%ebx           \n\t"
+      "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
+      "jb avg_rlp                  \n\t" // mov does not affect flags
+
+      // get # of bytes to alignment
+      "movl %%edi, _dif            \n\t" // take start of row
+      "addl %%ebx, _dif            \n\t" // add bpp
+      "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
+      "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
+      "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
+      "jz avg_go                   \n\t" //  alignment
+
+      // fix alignment
+      // Compute the Raw value for the bytes up to the alignment boundary
+      //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
+      "xorl %%ecx, %%ecx           \n\t"
+
+   "avg_lp1:                       \n\t"
+      "xorl %%eax, %%eax           \n\t"
+      "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
+      "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
+      "addw %%cx, %%ax             \n\t"
+      "incl %%ebx                  \n\t"
+      "shrw %%ax                   \n\t" // divide by 2
+      "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
+      "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
+      "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
+      "jb avg_lp1                  \n\t" // repeat until at alignment boundary
+
+   "avg_go:                        \n\t"
+      "movl _FullLength, %%eax     \n\t"
+      "movl %%eax, %%ecx           \n\t"
+      "subl %%ebx, %%eax           \n\t" // subtract alignment fix
+      "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
+      "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
+      "movl %%ecx, _MMXLength      \n\t"
+#ifdef __PIC__
+      "popl %%ebx                  \n\t" // restore index to Global Offset Table
+#endif
+
+      : "=c" (dummy_value_c),            // output regs (dummy)
+        "=S" (dummy_value_S),
+        "=D" (dummy_value_D)
+
+      : "0" (bpp),       // ecx          // input regs
+        "1" (prev_row),  // esi
+        "2" (row)        // edi
+
+      : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+      , "%ebx"
+#endif
+      // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
+      // (seems to work fine without...)
+   );
+
+   // now do the math for the rest of the row
+   switch (bpp)
+   {
+      case 3:
+      {
+         _ActiveMask.use  = 0x0000000000ffffffLL;
+         _ShiftBpp.use = 24;    // == 3 * 8
+         _ShiftRem.use = 40;    // == 64 - 24
+
+         __asm__ __volatile__ (
+            // re-init address pointers and offset
+            "movq _ActiveMask, %%mm7      \n\t"
+            "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
+            "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
+// preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
+            "movq _HBClearMask, %%mm4     \n\t"
+// preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+                                                // (correct pos. in loop below)
+         "avg_3lp:                        \n\t"
+            "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
+            "movq %%mm5, %%mm3            \n\t"
+            "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp)
+                                                // data
+            "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
+            "movq %%mm7, %%mm6            \n\t"
+            "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
+            "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
+            "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
+                                                // byte
+            "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
+                                                // each byte
+            // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
+            "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
+                                                // LBCarrys
+            "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
+                                                // where both
+                               // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
+                                                // byte
+            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                                // for each byte
+            "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1
+                                                // bytes to add to Avg
+            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
+                                                // Avg for each Active
+                               //  byte
+            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover
+                                                // bytes 3-5
+            "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
+            "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
+                                                // LBCarrys
+            "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
+                                                // where both
+                               // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
+                                                // byte
+            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                                // for each byte
+            "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
+                                                // bytes to add to Avg
+            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
+                                                // Avg for each Active
+                               //  byte
+
+            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last
+                                                // two
+                                 // bytes
+            "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
+                              // Data only needs to be shifted once here to
+                              // get the correct x-bpp offset.
+            "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
+                                                // LBCarrys
+            "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
+                                                // where both
+                              // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
+                                                // byte
+            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                                // for each byte
+            "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
+                                                // bytes to add to Avg
+            "addl $8, %%ecx               \n\t"
+            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
+                                                // Avg for each Active
+                                                // byte
+            // now ready to write back to memory
+            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+            // move updated Raw(x) to use as Raw(x-bpp) for next loop
+            "cmpl _MMXLength, %%ecx       \n\t"
+            "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
+            "jb avg_3lp                   \n\t"
+
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
+
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
+
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;  // end 3 bpp
+
+      case 6:
+      case 4:
+      //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
+      //case 5:   // GRR BOGUS
+      {
+         _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
+                                                  // appropriate inactive bytes
+         _ShiftBpp.use = bpp << 3;
+         _ShiftRem.use = 64 - _ShiftBpp.use;
+
+         __asm__ __volatile__ (
+            "movq _HBClearMask, %%mm4    \n\t"
+
+            // re-init address pointers and offset
+            "movl _dif, %%ecx            \n\t" // ecx:  x = offset to
+                                               // alignment boundary
+
+            // load _ActiveMask and clear all bytes except for 1st active group
+            "movq _ActiveMask, %%mm7     \n\t"
+// preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "psrlq _ShiftRem, %%mm7      \n\t"
+// preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+            "movq %%mm7, %%mm6           \n\t"
+            "movq _LBCarryMask, %%mm5    \n\t"
+            "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active
+                                               // group
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+                                          // (we correct pos. in loop below)
+         "avg_4lp:                       \n\t"
+            "movq (%%edi,%%ecx,), %%mm0  \n\t"
+            "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
+            "movq (%%esi,%%ecx,), %%mm1  \n\t"
+            // add (Prev_row/2) to average
+            "movq %%mm5, %%mm3           \n\t"
+            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
+            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
+            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
+                                               // each byte
+            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
+            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
+                                               // LBCarrys
+            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
+                                               // where both
+                              // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                               // for each byte
+            "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
+                                               // bytes to add to Avg
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
+                                               // for each Active
+                              // byte
+            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
+            "addl $8, %%ecx              \n\t"
+            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
+                                               // LBCarrys
+            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
+                                               // where both
+                              // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                               // for each byte
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
+                                               // bytes to add to Avg
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
+                                               // Avg for each Active
+                              // byte
+            "cmpl _MMXLength, %%ecx      \n\t"
+            // now ready to write back to memory
+            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+            // prep Raw(x-bpp) for next loop
+            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
+            "jb avg_4lp                  \n\t"
+
+            : "=S" (dummy_value_S),            // output regs (dummy)
+              "=D" (dummy_value_D)
+
+            : "0" (prev_row),  // esi          // input regs
+              "1" (row)        // edi
+
+            : "%ecx"                           // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;  // end 4,6 bpp
+
+      case 2:
+      {
+         _ActiveMask.use  = 0x000000000000ffffLL;
+         _ShiftBpp.use = 16;   // == 2 * 8
+         _ShiftRem.use = 48;   // == 64 - 16
+
+         __asm__ __volatile__ (
+            // load _ActiveMask
+            "movq _ActiveMask, %%mm7     \n\t"
+            // re-init address pointers and offset
+            "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment
+                                               // boundary
+            "movq _LBCarryMask, %%mm5    \n\t"
+// preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "movq _HBClearMask, %%mm4    \n\t"
+// preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+                              // (we correct pos. in loop below)
+         "avg_2lp:                       \n\t"
+            "movq (%%edi,%%ecx,), %%mm0  \n\t"
+            "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
+            "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX:  was psllq)
+            // add (Prev_row/2) to average
+            "movq %%mm5, %%mm3           \n\t"
+            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
+            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
+            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "movq %%mm7, %%mm6           \n\t"
+            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
+                                               // each byte
+
+            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
+            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
+                                               // LBCarrys
+            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
+                                               // where both
+                                               // lsb's were == 1 (only valid
+                                               // for active group)
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                               // for each byte
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
+                                               // bytes to add to Avg
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
+                                               // for each Active byte
+
+            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
+                                               // bytes 2 & 3
+            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
+            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
+                                               // LBCarrys
+            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
+                                               // where both
+                                               // lsb's were == 1 (only valid
+                                               // for active group)
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                               // for each byte
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
+                                               // bytes to add to Avg
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
+                                               // Avg for each Active byte
+
+            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
+                                               // bytes 4 & 5
+            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
+            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
+                                               // LBCarrys
+            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
+                                               // where both lsb's were == 1
+                                               // (only valid for active group)
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                               // for each byte
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
+                                               // bytes to add to Avg
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
+                                               // Avg for each Active byte
+
+            // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
+                                               // bytes 6 & 7
+            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
+            "addl $8, %%ecx              \n\t"
+            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
+                                               // LBCarrys
+            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
+                                               // where both
+                                               // lsb's were == 1 (only valid
+                                               // for active group)
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+                                               // for each byte
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
+                                               // bytes to add to Avg
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
+                                               // Avg for each Active byte
+
+            "cmpl _MMXLength, %%ecx      \n\t"
+            // now ready to write back to memory
+            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+            // prep Raw(x-bpp) for next loop
+            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
+            "jb avg_2lp                  \n\t"
+
+            : "=S" (dummy_value_S),            // output regs (dummy)
+              "=D" (dummy_value_D)
+
+            : "0" (prev_row),  // esi          // input regs
+              "1" (row)        // edi
+
+            : "%ecx"                           // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;  // end 2 bpp
+
+      case 1:
+      {
+         __asm__ __volatile__ (
+            // re-init address pointers and offset
+#ifdef __PIC__
+            "pushl %%ebx                 \n\t" // save Global Offset Table index
+#endif
+            "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment
+                                               // boundary
+// preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
+            "jnb avg_1end                \n\t"
+            // do Paeth decode for remaining bytes
+// preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+            "movl %%edi, %%edx           \n\t"
+// preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+            "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
+            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
+                                               //  in loop below
+         "avg_1lp:                       \n\t"
+            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
+            "xorl %%eax, %%eax           \n\t"
+            "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
+            "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
+            "addw %%cx, %%ax             \n\t"
+            "incl %%ebx                  \n\t"
+            "shrw %%ax                   \n\t" // divide by 2
+            "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
+                                               // inc ebx
+            "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
+            "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
+                         // mov does not affect flags; -1 to offset inc ebx
+            "jb avg_1lp                  \n\t"
+
+         "avg_1end:                      \n\t"
+#ifdef __PIC__
+            "popl %%ebx                  \n\t" // Global Offset Table index
+#endif
+
+            : "=c" (dummy_value_c),            // output regs (dummy)
+              "=S" (dummy_value_S),
+              "=D" (dummy_value_D)
+
+            : "0" (bpp),       // ecx          // input regs
+              "1" (prev_row),  // esi
+              "2" (row)        // edi
+
+            : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+            , "%ebx"
+#endif
+         );
+      }
+      return;  // end 1 bpp
+
+      case 8:
+      {
+         __asm__ __volatile__ (
+            // re-init address pointers and offset
+            "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
+            "movq _LBCarryMask, %%mm5    \n\t" //            boundary
+// preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "movq _HBClearMask, %%mm4    \n\t"
+// preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+                                      // (NO NEED to correct pos. in loop below)
+
+         "avg_8lp:                       \n\t"
+            "movq (%%edi,%%ecx,), %%mm0  \n\t"
+            "movq %%mm5, %%mm3           \n\t"
+            "movq (%%esi,%%ecx,), %%mm1  \n\t"
+            "addl $8, %%ecx              \n\t"
+            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
+            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
+            "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
+                                               //  where both lsb's were == 1
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
+            "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
+            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+            "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
+            "jb avg_8lp                  \n\t"
+
+            : "=S" (dummy_value_S),            // output regs (dummy)
+              "=D" (dummy_value_D)
+
+            : "0" (prev_row),  // esi          // input regs
+              "1" (row)        // edi
+
+            : "%ecx"                           // clobber list
+#if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2"
+            , "%mm3", "%mm4", "%mm5"
+#endif
+         );
+      }
+      break;  // end 8 bpp
+
+      default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
+      {
+
+#ifdef PNG_DEBUG
+         // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
+        png_debug(1,
+        "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
+#endif
+
+#if 0
+        __asm__ __volatile__ (
+            "movq _LBCarryMask, %%mm5    \n\t"
+            // re-init address pointers and offset
+            "movl _dif, %%ebx            \n\t" // ebx:  x = offset to
+                                               // alignment boundary
+            "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "movq _HBClearMask, %%mm4    \n\t"
+            "movl %%edi, %%edx           \n\t"
+            "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+            "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
+         "avg_Alp:                       \n\t"
+            "movq (%%edi,%%ebx,), %%mm0  \n\t"
+            "movq %%mm5, %%mm3           \n\t"
+            "movq (%%esi,%%ebx,), %%mm1  \n\t"
+            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
+            "movq (%%edx,%%ebx,), %%mm2  \n\t"
+            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
+            "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
+                                               // where both lsb's were == 1
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each
+                                               // byte
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
+                                               // byte
+            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
+                                               // each byte
+            "addl $8, %%ebx              \n\t"
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
+                                               // byte
+            "cmpl _MMXLength, %%ebx      \n\t"
+            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
+            "jb avg_Alp                  \n\t"
+
+            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+
+            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+
+            : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
+         );
+#endif /* 0 - NEVER REACHED */
+      }
+      break;
+
+   } // end switch (bpp)
+
+   __asm__ __volatile__ (
+      // MMX acceleration complete; now do clean-up
+      // check if any remaining bytes left to decode
+#ifdef __PIC__
+      "pushl %%ebx                 \n\t" // save index to Global Offset Table
+#endif
+      "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
+//pre "movl row, %%edi             \n\t" // edi:  Avg(x)
+      "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
+      "jnb avg_end                 \n\t"
+
+      // do Avg decode for remaining bytes
+//pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+      "movl %%edi, %%edx           \n\t"
+//pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+      "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
+      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
+
+   "avg_lp2:                       \n\t"
+      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
+      "xorl %%eax, %%eax           \n\t"
+      "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
+      "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
+      "addw %%cx, %%ax             \n\t"
+      "incl %%ebx                  \n\t"
+      "shrw %%ax                   \n\t" // divide by 2
+      "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
+      "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
+      "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
+      "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
+
+   "avg_end:                       \n\t"
+      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
+#ifdef __PIC__
+      "popl %%ebx                  \n\t" // restore index to Global Offset Table
+#endif
+
+      : "=c" (dummy_value_c),            // output regs (dummy)
+        "=S" (dummy_value_S),
+        "=D" (dummy_value_D)
+
+      : "0" (bpp),       // ecx          // input regs
+        "1" (prev_row),  // esi
+        "2" (row)        // edi
+
+      : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+      , "%ebx"
+#endif
+   );
+
+} /* end png_read_filter_row_mmx_avg() */
+#endif
+
+
+
+#ifdef PNG_THREAD_UNSAFE_OK
+//===========================================================================//
+//                                                                           //
+//         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
+//                                                                           //
+//===========================================================================//
+
+// Optimized code for PNG Paeth filter decoder
+
+static void /* PRIVATE */
+png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
+                              png_bytep prev_row)
+{
+   int bpp;
+   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
+   int dummy_value_S;
+   int dummy_value_D;
+
+   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
+   _FullLength  = row_info->rowbytes; // # of bytes to filter
+
+   __asm__ __volatile__ (
+#ifdef __PIC__
+      "pushl %%ebx                 \n\t" // save index to Global Offset Table
+#endif
+      "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
+//pre "movl row, %%edi             \n\t"
+      "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
+//pre "movl prev_row, %%esi        \n\t"
+      "xorl %%eax, %%eax           \n\t"
+
+      // Compute the Raw value for the first bpp bytes
+      // Note: the formula works out to be always
+      //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
+   "paeth_rlp:                     \n\t"
+      "movb (%%edi,%%ebx,), %%al   \n\t"
+      "addb (%%esi,%%ebx,), %%al   \n\t"
+      "incl %%ebx                  \n\t"
+//pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
+      "cmpl %%ecx, %%ebx           \n\t"
+      "movb %%al, -1(%%edi,%%ebx,) \n\t"
+      "jb paeth_rlp                \n\t"
+      // get # of bytes to alignment
+      "movl %%edi, _dif            \n\t" // take start of row
+      "addl %%ebx, _dif            \n\t" // add bpp
+      "xorl %%ecx, %%ecx           \n\t"
+      "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past alignment
+                                         // boundary
+      "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
+      "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx
+                                         // at alignment
+      "jz paeth_go                 \n\t"
+      // fix alignment
+
+   "paeth_lp1:                     \n\t"
+      "xorl %%eax, %%eax           \n\t"
+      // pav = p - a = (a + b - c) - a = b - c
+      "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
+      "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
+      "movl %%eax, _patemp         \n\t" // Save pav for later use
+      "xorl %%eax, %%eax           \n\t"
+      // pbv = p - b = (a + b - c) - b = a - c
+      "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
+      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
+      "movl %%eax, %%ecx           \n\t"
+      // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+      "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
+      // pc = abs(pcv)
+      "testl $0x80000000, %%eax    \n\t"
+      "jz paeth_pca                \n\t"
+      "negl %%eax                  \n\t" // reverse sign of neg values
+
+   "paeth_pca:                     \n\t"
+      "movl %%eax, _pctemp         \n\t" // save pc for later use
+      // pb = abs(pbv)
+      "testl $0x80000000, %%ecx    \n\t"
+      "jz paeth_pba                \n\t"
+      "negl %%ecx                  \n\t" // reverse sign of neg values
+
+   "paeth_pba:                     \n\t"
+      "movl %%ecx, _pbtemp         \n\t" // save pb for later use
+      // pa = abs(pav)
+      "movl _patemp, %%eax         \n\t"
+      "testl $0x80000000, %%eax    \n\t"
+      "jz paeth_paa                \n\t"
+      "negl %%eax                  \n\t" // reverse sign of neg values
+
+   "paeth_paa:                     \n\t"
+      "movl %%eax, _patemp         \n\t" // save pa for later use
+      // test if pa <= pb
+      "cmpl %%ecx, %%eax           \n\t"
+      "jna paeth_abb               \n\t"
+      // pa > pb; now test if pb <= pc
+      "cmpl _pctemp, %%ecx         \n\t"
+      "jna paeth_bbc               \n\t"
+      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
+      "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+      "jmp paeth_paeth             \n\t"
+
+   "paeth_bbc:                     \n\t"
+      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
+      "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
+      "jmp paeth_paeth             \n\t"
+
+   "paeth_abb:                     \n\t"
+      // pa <= pb; now test if pa <= pc
+      "cmpl _pctemp, %%eax         \n\t"
+      "jna paeth_abc               \n\t"
+      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
+      "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+      "jmp paeth_paeth             \n\t"
+
+   "paeth_abc:                     \n\t"
+      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
+      "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
+
+   "paeth_paeth:                   \n\t"
+      "incl %%ebx                  \n\t"
+      "incl %%edx                  \n\t"
+      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
+      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
+      "cmpl _dif, %%ebx            \n\t"
+      "jb paeth_lp1                \n\t"
+
+   "paeth_go:                      \n\t"
+      "movl _FullLength, %%ecx     \n\t"
+      "movl %%ecx, %%eax           \n\t"
+      "subl %%ebx, %%eax           \n\t" // subtract alignment fix
+      "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
+      "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
+      "movl %%ecx, _MMXLength      \n\t"
+#ifdef __PIC__
+      "popl %%ebx                  \n\t" // restore index to Global Offset Table
+#endif
+
+      : "=c" (dummy_value_c),            // output regs (dummy)
+        "=S" (dummy_value_S),
+        "=D" (dummy_value_D)
+
+      : "0" (bpp),       // ecx          // input regs
+        "1" (prev_row),  // esi
+        "2" (row)        // edi
+
+      : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+      , "%ebx"
+#endif
+   );
+
+   // now do the math for the rest of the row
+   switch (bpp)
+   {
+      case 3:
+      {
+         _ActiveMask.use = 0x0000000000ffffffLL;
+         _ActiveMaskEnd.use = 0xffff000000000000LL;
+         _ShiftBpp.use = 24;    // == bpp(3) * 8
+         _ShiftRem.use = 40;    // == 64 - 24
+
+         __asm__ __volatile__ (
+            "movl _dif, %%ecx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
+         "paeth_3lp:                     \n\t"
+            "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st
+                                               // 3 bytes
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st
+                                               // 3 bytes
+            // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            "psubw %%mm3, %%mm4          \n\t"
+            "pxor %%mm7, %%mm7           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "movq %%mm4, %%mm6           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "paddw %%mm5, %%mm6          \n\t"
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "psubw %%mm0, %%mm4          \n\t"
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm7, %%mm5          \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "pxor %%mm1, %%mm1           \n\t"
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "packuswb %%mm1, %%mm7       \n\t"
+            "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
+            "pand _ActiveMask, %%mm7     \n\t"
+            "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
+                                               // Raw(x-bpp)
+            // now do Paeth for 2nd set of bytes (3-5)
+            "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
+            "pxor %%mm7, %%mm7           \n\t"
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+            "psubw %%mm3, %%mm4          \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
+            //       pav + pbv = pbv + pav
+            "movq %%mm5, %%mm6           \n\t"
+            "paddw %%mm4, %%mm6          \n\t"
+
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
+            "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
+            "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
+            "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm0, %%mm5          \n\t"
+            "psubw %%mm7, %%mm4          \n\t"
+            "psubw %%mm0, %%mm5          \n\t"
+            "psubw %%mm7, %%mm4          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "pxor %%mm1, %%mm1           \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "packuswb %%mm1, %%mm7       \n\t"
+            "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
+            "pand _ActiveMask, %%mm7     \n\t"
+            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of
+                                               // 3 bytes
+             // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq %%mm7, %%mm1           \n\t"
+            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
+                                    // now mm1 will be used as Raw(x-bpp)
+            // now do Paeth for 3rd, and final, set of bytes (6-7)
+            "pxor %%mm7, %%mm7           \n\t"
+            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
+            "psubw %%mm3, %%mm4          \n\t"
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "movq %%mm4, %%mm6           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "paddw %%mm5, %%mm6          \n\t"
+
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm1, %%mm1           \n\t"
+            "packuswb %%mm7, %%mm1       \n\t"
+            // step ecx to next set of 8 bytes and repeat loop til done
+            "addl $8, %%ecx              \n\t"
+            "pand _ActiveMaskEnd, %%mm1  \n\t"
+            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
+                                                 // Raw(x)
+
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
+            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
+                                 // mm1 will be used as Raw(x-bpp) next loop
+                           // mm3 ready to be used as Prior(x-bpp) next loop
+            "jb paeth_3lp                \n\t"
+
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
+
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
+
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;  // end 3 bpp
+
+      case 6:
+      //case 7:   // GRR BOGUS
+      //case 5:   // GRR BOGUS
+      {
+         _ActiveMask.use  = 0x00000000ffffffffLL;
+         _ActiveMask2.use = 0xffffffff00000000LL;
+         _ShiftBpp.use = bpp << 3;    // == bpp * 8
+         _ShiftRem.use = 64 - _ShiftBpp.use;
+
+         __asm__ __volatile__ (
+            "movl _dif, %%ecx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+
+         "paeth_6lp:                     \n\t"
+            // must shift to position Raw(x-bpp) data
+            "psrlq _ShiftRem, %%mm1      \n\t"
+            // do first set of 4 bytes
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
+            // must shift to position Prior(x-bpp) data
+            "psrlq _ShiftRem, %%mm3      \n\t"
+            // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            "psubw %%mm3, %%mm4          \n\t"
+            "pxor %%mm7, %%mm7           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "movq %%mm4, %%mm6           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "paddw %%mm5, %%mm6          \n\t"
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "psubw %%mm0, %%mm4          \n\t"
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm7, %%mm5          \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "pxor %%mm1, %%mm1           \n\t"
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "packuswb %%mm1, %%mm7       \n\t"
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
+            "pand _ActiveMask, %%mm7     \n\t"
+            "psrlq _ShiftRem, %%mm3      \n\t"
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
+            "movq %%mm2, %%mm6           \n\t"
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
+            "psllq _ShiftBpp, %%mm6      \n\t"
+            "movq %%mm7, %%mm5           \n\t"
+            "psrlq _ShiftRem, %%mm1      \n\t"
+            "por %%mm6, %%mm3            \n\t"
+            "psllq _ShiftBpp, %%mm5      \n\t"
+            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            "por %%mm5, %%mm1            \n\t"
+            // do second set of 4 bytes
+            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
+            // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            "psubw %%mm3, %%mm4          \n\t"
+            "pxor %%mm7, %%mm7           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "movq %%mm4, %%mm6           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "paddw %%mm5, %%mm6          \n\t"
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "psubw %%mm0, %%mm4          \n\t"
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm7, %%mm5          \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "pxor %%mm1, %%mm1           \n\t"
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "pxor %%mm1, %%mm1           \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            // step ecx to next set of 8 bytes and repeat loop til done
+            "addl $8, %%ecx              \n\t"
+            "packuswb %%mm7, %%mm1       \n\t"
+            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
+                                // mm1 will be used as Raw(x-bpp) next loop
+            "jb paeth_6lp                \n\t"
+
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
+
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
+
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;  // end 6 bpp
+
+      case 4:
+      {
+         _ActiveMask.use  = 0x00000000ffffffffLL;
+
+         __asm__ __volatile__ (
+            "movl _dif, %%ecx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
+                                     //  a=Raw(x-bpp) bytes
+         "paeth_4lp:                     \n\t"
+            // do first set of 4 bytes
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+            "punpckhbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            "psubw %%mm3, %%mm4          \n\t"
+            "pxor %%mm7, %%mm7           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "movq %%mm4, %%mm6           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "paddw %%mm5, %%mm6          \n\t"
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "psubw %%mm0, %%mm4          \n\t"
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm7, %%mm5          \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "pxor %%mm1, %%mm1           \n\t"
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "packuswb %%mm1, %%mm7       \n\t"
+            "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
+            "pand _ActiveMask, %%mm7     \n\t"
+            "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
+            // do second set of 4 bytes
+            "punpckhbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
+            // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            "psubw %%mm3, %%mm4          \n\t"
+            "pxor %%mm7, %%mm7           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "movq %%mm4, %%mm6           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "paddw %%mm5, %%mm6          \n\t"
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "psubw %%mm0, %%mm4          \n\t"
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm7, %%mm5          \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "pxor %%mm1, %%mm1           \n\t"
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "pxor %%mm1, %%mm1           \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            // step ecx to next set of 8 bytes and repeat loop til done
+            "addl $8, %%ecx              \n\t"
+            "packuswb %%mm7, %%mm1       \n\t"
+            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
+                                // mm1 will be used as Raw(x-bpp) next loop
+            "jb paeth_4lp                \n\t"
+
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
+
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
+
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;  // end 4 bpp
+
+      case 8:                          // bpp == 8
+      {
+         _ActiveMask.use  = 0x00000000ffffffffLL;
+
+         __asm__ __volatile__ (
+            "movl _dif, %%ecx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
+                                       //  a=Raw(x-bpp) bytes
+         "paeth_8lp:                     \n\t"
+            // do first set of 4 bytes
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
+            // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            "psubw %%mm3, %%mm4          \n\t"
+            "pxor %%mm7, %%mm7           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "movq %%mm4, %%mm6           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "paddw %%mm5, %%mm6          \n\t"
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "psubw %%mm0, %%mm4          \n\t"
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm7, %%mm5          \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "pxor %%mm1, %%mm1           \n\t"
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "packuswb %%mm1, %%mm7       \n\t"
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+            "pand _ActiveMask, %%mm7     \n\t"
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
+
+            // do second set of 4 bytes
+            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
+            // pav = p - a = (a + b - c) - a = b - c
+            "movq %%mm2, %%mm4           \n\t"
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movq %%mm1, %%mm5           \n\t"
+            "psubw %%mm3, %%mm4          \n\t"
+            "pxor %%mm7, %%mm7           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "movq %%mm4, %%mm6           \n\t"
+            "psubw %%mm3, %%mm5          \n\t"
+            // pa = abs(p-a) = abs(pav)
+            // pb = abs(p-b) = abs(pbv)
+            // pc = abs(p-c) = abs(pcv)
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "paddw %%mm5, %%mm6          \n\t"
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "psubw %%mm0, %%mm4          \n\t"
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
+            "psubw %%mm0, %%mm4          \n\t"
+            "psubw %%mm7, %%mm5          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
+            "psubw %%mm7, %%mm5          \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            //  test pa <= pb
+            "movq %%mm4, %%mm7           \n\t"
+            "psubw %%mm0, %%mm6          \n\t"
+            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
+            "movq %%mm7, %%mm0           \n\t"
+            // use mm7 mask to merge pa & pb
+            "pand %%mm7, %%mm5           \n\t"
+            // use mm0 mask copy to merge a & b
+            "pand %%mm0, %%mm2           \n\t"
+            "pandn %%mm4, %%mm7          \n\t"
+            "pandn %%mm1, %%mm0          \n\t"
+            "paddw %%mm5, %%mm7          \n\t"
+            "paddw %%mm2, %%mm0          \n\t"
+            //  test  ((pa <= pb)? pa:pb) <= pc
+            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
+            "pxor %%mm1, %%mm1           \n\t"
+            "pand %%mm7, %%mm3           \n\t"
+            "pandn %%mm0, %%mm7          \n\t"
+            "pxor %%mm1, %%mm1           \n\t"
+            "paddw %%mm3, %%mm7          \n\t"
+            "pxor %%mm0, %%mm0           \n\t"
+            // step ecx to next set of 8 bytes and repeat loop til done
+            "addl $8, %%ecx              \n\t"
+            "packuswb %%mm7, %%mm1       \n\t"
+            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
+                            // mm1 will be used as Raw(x-bpp) next loop
+            "jb paeth_8lp                \n\t"
+
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
+
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
+
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;  // end 8 bpp
+
+      case 1:                // bpp = 1
+      case 2:                // bpp = 2
+      default:               // bpp > 8
+      {
+         __asm__ __volatile__ (
+#ifdef __PIC__
+            "pushl %%ebx                 \n\t" // save Global Offset Table index
+#endif
+            "movl _dif, %%ebx            \n\t"
+            "cmpl _FullLength, %%ebx     \n\t"
+            "jnb paeth_dend              \n\t"
+
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
+            // do Paeth decode for remaining bytes
+            "movl %%ebx, %%edx           \n\t"
+// preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+            "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
+            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
+
+         "paeth_dlp:                     \n\t"
+            "xorl %%eax, %%eax           \n\t"
+            // pav = p - a = (a + b - c) - a = b - c
+            "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
+            "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+            "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
+            "movl %%eax, _patemp         \n\t" // Save pav for later use
+            "xorl %%eax, %%eax           \n\t"
+            // pbv = p - b = (a + b - c) - b = a - c
+            "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
+            "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
+            "movl %%eax, %%ecx           \n\t"
+            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+            "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
+            // pc = abs(pcv)
+            "testl $0x80000000, %%eax    \n\t"
+            "jz paeth_dpca               \n\t"
+            "negl %%eax                  \n\t" // reverse sign of neg values
+
+         "paeth_dpca:                    \n\t"
+            "movl %%eax, _pctemp         \n\t" // save pc for later use
+            // pb = abs(pbv)
+            "testl $0x80000000, %%ecx    \n\t"
+            "jz paeth_dpba               \n\t"
+            "negl %%ecx                  \n\t" // reverse sign of neg values
+
+         "paeth_dpba:                    \n\t"
+            "movl %%ecx, _pbtemp         \n\t" // save pb for later use
+            // pa = abs(pav)
+            "movl _patemp, %%eax         \n\t"
+            "testl $0x80000000, %%eax    \n\t"
+            "jz paeth_dpaa               \n\t"
+            "negl %%eax                  \n\t" // reverse sign of neg values
+
+         "paeth_dpaa:                    \n\t"
+            "movl %%eax, _patemp         \n\t" // save pa for later use
+            // test if pa <= pb
+            "cmpl %%ecx, %%eax           \n\t"
+            "jna paeth_dabb              \n\t"
+            // pa > pb; now test if pb <= pc
+            "cmpl _pctemp, %%ecx         \n\t"
+            "jna paeth_dbbc              \n\t"
+            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
+            "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+            "jmp paeth_dpaeth            \n\t"
+
+         "paeth_dbbc:                    \n\t"
+            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
+            "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
+            "jmp paeth_dpaeth            \n\t"
+
+         "paeth_dabb:                    \n\t"
+            // pa <= pb; now test if pa <= pc
+            "cmpl _pctemp, %%eax         \n\t"
+            "jna paeth_dabc              \n\t"
+            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
+            "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+            "jmp paeth_dpaeth            \n\t"
+
+         "paeth_dabc:                    \n\t"
+            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
+            "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
+
+         "paeth_dpaeth:                  \n\t"
+            "incl %%ebx                  \n\t"
+            "incl %%edx                  \n\t"
+            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
+            "addb %%cl, -1(%%edi,%%ebx,) \n\t"
+            "cmpl _FullLength, %%ebx     \n\t"
+            "jb paeth_dlp                \n\t"
+
+         "paeth_dend:                    \n\t"
+#ifdef __PIC__
+            "popl %%ebx                  \n\t" // index to Global Offset Table
+#endif
+
+            : "=c" (dummy_value_c),            // output regs (dummy)
+              "=S" (dummy_value_S),
+              "=D" (dummy_value_D)
+
+            : "0" (bpp),       // ecx          // input regs
+              "1" (prev_row),  // esi
+              "2" (row)        // edi
+
+            : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+            , "%ebx"
+#endif
+         );
+      }
+      return;                   // No need to go further with this one
+
+   } // end switch (bpp)
+
+   __asm__ __volatile__ (
+      // MMX acceleration complete; now do clean-up
+      // check if any remaining bytes left to decode
+#ifdef __PIC__
+      "pushl %%ebx                 \n\t" // save index to Global Offset Table
+#endif
+      "movl _MMXLength, %%ebx      \n\t"
+      "cmpl _FullLength, %%ebx     \n\t"
+      "jnb paeth_end               \n\t"
+//pre "movl row, %%edi             \n\t"
+//pre "movl prev_row, %%esi        \n\t"
+      // do Paeth decode for remaining bytes
+      "movl %%ebx, %%edx           \n\t"
+//pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+      "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
+      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
+
+   "paeth_lp2:                     \n\t"
+      "xorl %%eax, %%eax           \n\t"
+      // pav = p - a = (a + b - c) - a = b - c
+      "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
+      "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
+      "movl %%eax, _patemp         \n\t" // Save pav for later use
+      "xorl %%eax, %%eax           \n\t"
+      // pbv = p - b = (a + b - c) - b = a - c
+      "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
+      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
+      "movl %%eax, %%ecx           \n\t"
+      // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+      "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
+      // pc = abs(pcv)
+      "testl $0x80000000, %%eax    \n\t"
+      "jz paeth_pca2               \n\t"
+      "negl %%eax                  \n\t" // reverse sign of neg values
+
+   "paeth_pca2:                    \n\t"
+      "movl %%eax, _pctemp         \n\t" // save pc for later use
+      // pb = abs(pbv)
+      "testl $0x80000000, %%ecx    \n\t"
+      "jz paeth_pba2               \n\t"
+      "negl %%ecx                  \n\t" // reverse sign of neg values
+
+   "paeth_pba2:                    \n\t"
+      "movl %%ecx, _pbtemp         \n\t" // save pb for later use
+      // pa = abs(pav)
+      "movl _patemp, %%eax         \n\t"
+      "testl $0x80000000, %%eax    \n\t"
+      "jz paeth_paa2               \n\t"
+      "negl %%eax                  \n\t" // reverse sign of neg values
+
+   "paeth_paa2:                    \n\t"
+      "movl %%eax, _patemp         \n\t" // save pa for later use
+      // test if pa <= pb
+      "cmpl %%ecx, %%eax           \n\t"
+      "jna paeth_abb2              \n\t"
+      // pa > pb; now test if pb <= pc
+      "cmpl _pctemp, %%ecx         \n\t"
+      "jna paeth_bbc2              \n\t"
+      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
+      "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+      "jmp paeth_paeth2            \n\t"
+
+   "paeth_bbc2:                    \n\t"
+      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
+      "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
+      "jmp paeth_paeth2            \n\t"
+
+   "paeth_abb2:                    \n\t"
+      // pa <= pb; now test if pa <= pc
+      "cmpl _pctemp, %%eax         \n\t"
+      "jna paeth_abc2              \n\t"
+      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
+      "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
+      "jmp paeth_paeth2            \n\t"
+
+   "paeth_abc2:                    \n\t"
+      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
+      "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
+
+   "paeth_paeth2:                  \n\t"
+      "incl %%ebx                  \n\t"
+      "incl %%edx                  \n\t"
+      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
+      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
+      "cmpl _FullLength, %%ebx     \n\t"
+      "jb paeth_lp2                \n\t"
+
+   "paeth_end:                     \n\t"
+      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
+#ifdef __PIC__
+      "popl %%ebx                  \n\t" // restore index to Global Offset Table
+#endif
+
+      : "=c" (dummy_value_c),            // output regs (dummy)
+        "=S" (dummy_value_S),
+        "=D" (dummy_value_D)
+
+      : "0" (bpp),       // ecx          // input regs
+        "1" (prev_row),  // esi
+        "2" (row)        // edi
+
+      : "%eax", "%edx"                   // clobber list (no input regs!)
+#ifndef __PIC__
+      , "%ebx"
+#endif
+   );
+
+} /* end png_read_filter_row_mmx_paeth() */
+#endif
+
+
+
+
+#ifdef PNG_THREAD_UNSAFE_OK
+//===========================================================================//
+//                                                                           //
+//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
+//                                                                           //
+//===========================================================================//
+
+// Optimized code for PNG Sub filter decoder
+
+static void /* PRIVATE */
+png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
+{
+   int bpp;
+   int dummy_value_a;
+   int dummy_value_D;
+
+   bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
+   _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
+
+   __asm__ __volatile__ (
+//pre "movl row, %%edi             \n\t"
+      "movl %%edi, %%esi           \n\t" // lp = row
+//pre "movl bpp, %%eax             \n\t"
+      "addl %%eax, %%edi           \n\t" // rp = row + bpp
+//irr "xorl %%eax, %%eax           \n\t"
+      // get # of bytes to alignment
+      "movl %%edi, _dif            \n\t" // take start of row
+      "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
+                                         //  alignment boundary
+      "xorl %%ecx, %%ecx           \n\t"
+      "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
+      "subl %%edi, _dif            \n\t" // subtract from start ==> value
+      "jz sub_go                   \n\t" //  ecx at alignment
+
+   "sub_lp1:                       \n\t" // fix alignment
+      "movb (%%esi,%%ecx,), %%al   \n\t"
+      "addb %%al, (%%edi,%%ecx,)   \n\t"
+      "incl %%ecx                  \n\t"
+      "cmpl _dif, %%ecx            \n\t"
+      "jb sub_lp1                  \n\t"
+
+   "sub_go:                        \n\t"
+      "movl _FullLength, %%eax     \n\t"
+      "movl %%eax, %%edx           \n\t"
+      "subl %%ecx, %%edx           \n\t" // subtract alignment fix
+      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
+      "subl %%edx, %%eax           \n\t" // drop over bytes from length
+      "movl %%eax, _MMXLength      \n\t"
+
+      : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+        "=D" (dummy_value_D)    // 1
+
+      : "0" (bpp),              // eax    // input regs
+        "1" (row)               // edi
+
+      : "%esi", "%ecx", "%edx"            // clobber list
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+      , "%mm0", "%mm1", "%mm2", "%mm3"
+      , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+   );
+
+   // now do the math for the rest of the row
+   switch (bpp)
+   {
+      case 3:
+      {
+         _ActiveMask.use  = 0x0000ffffff000000LL;
+         _ShiftBpp.use = 24;       // == 3 * 8
+         _ShiftRem.use  = 40;      // == 64 - 24
+
+         __asm__ __volatile__ (
+// preload  "movl row, %%edi              \n\t"
+            "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
+                                                //  active byte group
+            "movl %%edi, %%esi            \n\t" // lp = row
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
+            "movq %%mm7, %%mm6            \n\t"
+            "movl _dif, %%edx             \n\t"
+            "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
+                                                //  3rd active byte group
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
+
+         "sub_3lp:                        \n\t" // shift data for adding first
+            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
+                                                //  shift clears inactive bytes)
+            // add 1st active group
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 2nd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 3rd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
+            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
+            "jb sub_3lp                   \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;
+
+      case 1:
+      {
+         __asm__ __volatile__ (
+            "movl _dif, %%edx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+            "cmpl _FullLength, %%edx     \n\t"
+            "jnb sub_1end                \n\t"
+            "movl %%edi, %%esi           \n\t" // lp = row
+            "xorl %%eax, %%eax           \n\t"
+// preload  "movl bpp, %%eax             \n\t"
+            "addl %%eax, %%edi           \n\t" // rp = row + bpp
+
+         "sub_1lp:                       \n\t"
+            "movb (%%esi,%%edx,), %%al   \n\t"
+            "addb %%al, (%%edi,%%edx,)   \n\t"
+            "incl %%edx                  \n\t"
+            "cmpl _FullLength, %%edx     \n\t"
+            "jb sub_1lp                  \n\t"
+
+         "sub_1end:                      \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+         );
+      }
+      return;
+
+      case 6:
+      case 4:
+      //case 7:   // GRR BOGUS
+      //case 5:   // GRR BOGUS
+      {
+         _ShiftBpp.use = bpp << 3;
+         _ShiftRem.use = 64 - _ShiftBpp.use;
+
+         __asm__ __volatile__ (
+// preload  "movl row, %%edi              \n\t"
+            "movl _dif, %%edx             \n\t"
+            "movl %%edi, %%esi            \n\t" // lp = row
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
+
+         "sub_4lp:                        \n\t" // shift data for adding first
+            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
+                                                //  shift clears inactive bytes)
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 2nd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t"
+            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
+            "jb sub_4lp                   \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1"
+#endif
+         );
+      }
+      break;
+
+      case 2:
+      {
+         _ActiveMask.use = 0x00000000ffff0000LL;
+         _ShiftBpp.use = 16;       // == 2 * 8
+         _ShiftRem.use = 48;       // == 64 - 16
+
+         __asm__ __volatile__ (
+            "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
+                                                //  active byte group
+            "movl _dif, %%edx             \n\t"
+            "movq %%mm7, %%mm6            \n\t"
+// preload  "movl row, %%edi              \n\t"
+            "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
+                                                //  3rd active byte group
+            "movl %%edi, %%esi            \n\t" // lp = row
+            "movq %%mm6, %%mm5            \n\t"
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
+            "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
+                                                //  4th active byte group
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
+
+         "sub_2lp:                        \n\t" // shift data for adding first
+            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
+                                                //  shift clears inactive bytes)
+            // add 1st active group
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 2nd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 3rd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 4th active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
+            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
+            "jb sub_2lp                   \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;
+
+      case 8:
+      {
+         __asm__ __volatile__ (
+// preload  "movl row, %%edi              \n\t"
+            "movl _dif, %%edx             \n\t"
+            "movl %%edi, %%esi            \n\t" // lp = row
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
+            "movl _MMXLength, %%ecx       \n\t"
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%edx,), %%mm7 \n\t"
+            "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
+
+         "sub_8lp:                        \n\t"
+            "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
+            "paddb %%mm7, %%mm0           \n\t"
+            "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
+            "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
+
+            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
+            // This will be repeated for each group of 8 bytes with the 8th
+            // group being used as the Raw(x-bpp) for the 1st group of the
+            // next loop.
+
+            "paddb %%mm0, %%mm1           \n\t"
+            "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
+            "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
+            "paddb %%mm1, %%mm2           \n\t"
+            "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
+            "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
+            "paddb %%mm2, %%mm3           \n\t"
+            "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
+            "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
+            "paddb %%mm3, %%mm4           \n\t"
+            "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
+            "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
+            "paddb %%mm4, %%mm5           \n\t"
+            "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
+            "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
+            "paddb %%mm5, %%mm6           \n\t"
+            "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
+            "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
+            "addl $64, %%edx              \n\t"
+            "paddb %%mm6, %%mm7           \n\t"
+            "cmpl %%ecx, %%edx            \n\t"
+            "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
+            "jb sub_8lp                   \n\t"
+
+            "cmpl _MMXLength, %%edx       \n\t"
+            "jnb sub_8lt8                 \n\t"
+
+         "sub_8lpA:                       \n\t"
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm7, %%mm0           \n\t"
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
+            "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
+                                                //  to mm1 to be new Raw(x-bpp)
+                                                //  for next loop
+            "jb sub_8lpA                  \n\t"
+
+         "sub_8lt8:                       \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%ecx", "%edx", "%esi"            // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+         );
+      }
+      break;
+
+      default:                // bpp greater than 8 bytes   GRR BOGUS
+      {
+         __asm__ __volatile__ (
+            "movl _dif, %%edx             \n\t"
+// preload  "movl row, %%edi              \n\t"
+            "movl %%edi, %%esi            \n\t" // lp = row
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
+
+         "sub_Alp:                        \n\t"
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "movq (%%esi,%%edx,), %%mm1   \n\t"
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
+                                                //  -8 to offset addl edx
+            "jb sub_Alp                   \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1"
+#endif
+         );
+      }
+      break;
+
+   } // end switch (bpp)
+
+   __asm__ __volatile__ (
+      "movl _MMXLength, %%edx       \n\t"
+//pre "movl row, %%edi              \n\t"
+      "cmpl _FullLength, %%edx      \n\t"
+      "jnb sub_end                  \n\t"
+
+      "movl %%edi, %%esi            \n\t" // lp = row
+//pre "movl bpp, %%eax              \n\t"
+      "addl %%eax, %%edi            \n\t" // rp = row + bpp
+      "xorl %%eax, %%eax            \n\t"
+
+   "sub_lp2:                        \n\t"
+      "movb (%%esi,%%edx,), %%al    \n\t"
+      "addb %%al, (%%edi,%%edx,)    \n\t"
+      "incl %%edx                   \n\t"
+      "cmpl _FullLength, %%edx      \n\t"
+      "jb sub_lp2                   \n\t"
+
+   "sub_end:                        \n\t"
+      "EMMS                         \n\t" // end MMX instructions
+
+      : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+        "=D" (dummy_value_D)    // 1
+
+      : "0" (bpp),              // eax    // input regs
+        "1" (row)               // edi
+
+      : "%edx", "%esi"                    // clobber list
+   );
+
+} // end of png_read_filter_row_mmx_sub()
+#endif
+
+
+
+
+//===========================================================================//
+//                                                                           //
+//            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
+//                                                                           //
+//===========================================================================//
+
+// Optimized code for PNG Up filter decoder
+
+static void /* PRIVATE */
+png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
+                           png_bytep prev_row)
+{
+   png_uint_32 len;
+   int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
+   int dummy_value_S;
+   int dummy_value_D;
+
+   len = row_info->rowbytes;              // number of bytes to filter
+
+   __asm__ __volatile__ (
+//pre "movl row, %%edi              \n\t"
+      // get # of bytes to alignment
+#ifdef __PIC__
+      "pushl %%ebx                  \n\t"
+#endif
+      "movl %%edi, %%ecx            \n\t"
+      "xorl %%ebx, %%ebx            \n\t"
+      "addl $0x7, %%ecx             \n\t"
+      "xorl %%eax, %%eax            \n\t"
+      "andl $0xfffffff8, %%ecx      \n\t"
+//pre "movl prev_row, %%esi         \n\t"
+      "subl %%edi, %%ecx            \n\t"
+      "jz up_go                     \n\t"
+
+   "up_lp1:                         \n\t" // fix alignment
+      "movb (%%edi,%%ebx,), %%al    \n\t"
+      "addb (%%esi,%%ebx,), %%al    \n\t"
+      "incl %%ebx                   \n\t"
+      "cmpl %%ecx, %%ebx            \n\t"
+      "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
+      "jb up_lp1                    \n\t" //  offset incl ebx
+
+   "up_go:                          \n\t"
+//pre "movl len, %%edx              \n\t"
+      "movl %%edx, %%ecx            \n\t"
+      "subl %%ebx, %%edx            \n\t" // subtract alignment fix
+      "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
+      "subl %%edx, %%ecx            \n\t" // drop over bytes from length
+
+      // unrolled loop - use all MMX registers and interleave to reduce
+      // number of branch instructions (loops) and reduce partial stalls
+   "up_loop:                        \n\t"
+      "movq (%%esi,%%ebx,), %%mm1   \n\t"
+      "movq (%%edi,%%ebx,), %%mm0   \n\t"
+      "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
+      "paddb %%mm1, %%mm0           \n\t"
+      "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
+      "movq %%mm0, (%%edi,%%ebx,)   \n\t"
+      "paddb %%mm3, %%mm2           \n\t"
+      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
+      "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
+      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
+      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
+      "paddb %%mm5, %%mm4           \n\t"
+      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
+      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
+      "paddb %%mm7, %%mm6           \n\t"
+      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
+      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
+      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
+      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
+      "paddb %%mm1, %%mm0           \n\t"
+      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
+      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
+      "paddb %%mm3, %%mm2           \n\t"
+      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
+      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
+      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
+      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
+      "paddb %%mm5, %%mm4           \n\t"
+      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
+      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
+      "addl $64, %%ebx              \n\t"
+      "paddb %%mm7, %%mm6           \n\t"
+      "cmpl %%ecx, %%ebx            \n\t"
+      "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
+      "jb up_loop                   \n\t" //  -8 to offset addl ebx
+
+      "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
+      "jz up_end                    \n\t"
+
+      "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
+      "jb up_lt8                    \n\t" //  [added by lcreeve at netins.net]
+
+      "addl %%edx, %%ecx            \n\t"
+      "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
+      "subl %%edx, %%ecx            \n\t" // drop over bytes from length
+      "jz up_lt8                    \n\t"
+
+   "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
+      "movq (%%esi,%%ebx,), %%mm1   \n\t"
+      "movq (%%edi,%%ebx,), %%mm0   \n\t"
+      "addl $8, %%ebx               \n\t"
+      "paddb %%mm1, %%mm0           \n\t"
+      "cmpl %%ecx, %%ebx            \n\t"
+      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
+      "jb up_lpA                    \n\t" //  offset add ebx
+      "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
+      "jz up_end                    \n\t"
+
+   "up_lt8:                         \n\t"
+      "xorl %%eax, %%eax            \n\t"
+      "addl %%edx, %%ecx            \n\t" // move over byte count into counter
+
+   "up_lp2:                         \n\t" // use x86 regs for remaining bytes
+      "movb (%%edi,%%ebx,), %%al    \n\t"
+      "addb (%%esi,%%ebx,), %%al    \n\t"
+      "incl %%ebx                   \n\t"
+      "cmpl %%ecx, %%ebx            \n\t"
+      "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
+      "jb up_lp2                    \n\t" //  offset inc ebx
+
+   "up_end:                         \n\t"
+      "EMMS                         \n\t" // conversion of filtered row complete
+#ifdef __PIC__
+      "popl %%ebx                   \n\t"
+#endif
+
+      : "=d" (dummy_value_d),   // 0      // output regs (dummy)
+        "=S" (dummy_value_S),   // 1
+        "=D" (dummy_value_D)    // 2
+
+      : "0" (len),              // edx    // input regs
+        "1" (prev_row),         // esi
+        "2" (row)               // edi
+
+      : "%eax", "%ecx"            // clobber list (no input regs!)
+#ifndef __PIC__
+      , "%ebx"
+#endif
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+      , "%mm0", "%mm1", "%mm2", "%mm3"
+      , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
+   );
+
+} // end of png_read_filter_row_mmx_up()
+
+#endif /* PNG_MMX_CODE_SUPPORTED */
+
+
+
+
+/*===========================================================================*/
+/*                                                                           */
+/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
+/*                                                                           */
+/*===========================================================================*/
+
+
+/* Optimized png_read_filter_row routines */
+
+void /* PRIVATE */
+png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
+   row, png_bytep prev_row, int filter)
+{
+#ifdef PNG_DEBUG
+   char filnm[10];
+#endif
+
+#if defined(PNG_MMX_CODE_SUPPORTED)
+/* GRR:  these are superseded by png_ptr->asm_flags: */
+#define UseMMX_sub    1   // GRR:  converted 20000730
+#define UseMMX_up     1   // GRR:  converted 20000729
+#define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
+#define UseMMX_paeth  1   // GRR:  converted 20000828
+
+   if (_mmx_supported == 2) {
+       /* this should have happened in png_init_mmx_flags() already */
+#if !defined(PNG_1_0_X)
+       png_warning(png_ptr, "asm_flags may not have been initialized");
+#endif
+       png_mmx_support();
+   }
+#endif /* PNG_MMX_CODE_SUPPORTED */
+
+#ifdef PNG_DEBUG
+   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
+   switch (filter)
+   {
+      case 0: sprintf(filnm, "none");
+         break;
+      case 1: sprintf(filnm, "sub-%s",
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
+#endif
+#endif
+"x86");
+         break;
+      case 2: sprintf(filnm, "up-%s",
+#ifdef PNG_MMX_CODE_SUPPORTED
+#if !defined(PNG_1_0_X)
+        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
+#endif
+#endif
+ "x86");
+         break;
+      case 3: sprintf(filnm, "avg-%s",
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
+#endif
+#endif
+ "x86");
+         break;
+      case 4: sprintf(filnm, "Paeth-%s",
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
+#endif
+#endif
+"x86");
+         break;
+      default: sprintf(filnm, "unknw");
+         break;
+   }
+   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
+   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
+   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
+      (int)((row_info->pixel_depth + 7) >> 3));
+   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
+#endif /* PNG_DEBUG */
+
+   switch (filter)
+   {
+      case PNG_FILTER_VALUE_NONE:
+         break;
+
+      case PNG_FILTER_VALUE_SUB:
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
+             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
+             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
+#else
+         if (_mmx_supported)
+#endif
+         {
+            png_read_filter_row_mmx_sub(row_info, row);
+         }
+         else
+#endif /* PNG_MMX_CODE_SUPPORTED */
+         {
+            png_uint_32 i;
+            png_uint_32 istop = row_info->rowbytes;
+            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+            png_bytep rp = row + bpp;
+            png_bytep lp = row;
+
+            for (i = bpp; i < istop; i++)
+            {
+               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
+               rp++;
+            }
+         }  /* end !UseMMX_sub */
+         break;
+
+      case PNG_FILTER_VALUE_UP:
+#if defined(PNG_MMX_CODE_SUPPORTED)
+#if !defined(PNG_1_0_X)
+         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
+             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
+             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
+#else
+         if (_mmx_supported)
+#endif
+         {
+            png_read_filter_row_mmx_up(row_info, row, prev_row);
+         }
+          else
+#endif /* PNG_MMX_CODE_SUPPORTED */
+         {
+            png_uint_32 i;
+            png_uint_32 istop = row_info->rowbytes;
+            png_bytep rp = row;
+            png_bytep pp = prev_row;
+
+            for (i = 0; i < istop; ++i)
+            {
+               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+               rp++;
+            }
+         }  /* end !UseMMX_up */
+         break;
+
+      case PNG_FILTER_VALUE_AVG:
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
+             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
+             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
+#else
+         if (_mmx_supported)
+#endif
+         {
+            png_read_filter_row_mmx_avg(row_info, row, prev_row);
+         }
+         else
+#endif /* PNG_MMX_CODE_SUPPORTED */
+         {
+            png_uint_32 i;
+            png_bytep rp = row;
+            png_bytep pp = prev_row;
+            png_bytep lp = row;
+            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+            png_uint_32 istop = row_info->rowbytes - bpp;
+
+            for (i = 0; i < bpp; i++)
+            {
+               *rp = (png_byte)(((int)(*rp) +
+                  ((int)(*pp++) >> 1)) & 0xff);
+               rp++;
+            }
+
+            for (i = 0; i < istop; i++)
+            {
+               *rp = (png_byte)(((int)(*rp) +
+                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
+               rp++;
+            }
+         }  /* end !UseMMX_avg */
+         break;
+
+      case PNG_FILTER_VALUE_PAETH:
+#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
+#if !defined(PNG_1_0_X)
+         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
+             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
+             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
+#else
+         if (_mmx_supported)
+#endif
+         {
+            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
+         }
+         else
+#endif /* PNG_MMX_CODE_SUPPORTED */
+         {
+            png_uint_32 i;
+            png_bytep rp = row;
+            png_bytep pp = prev_row;
+            png_bytep lp = row;
+            png_bytep cp = prev_row;
+            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+            png_uint_32 istop = row_info->rowbytes - bpp;
+
+            for (i = 0; i < bpp; i++)
+            {
+               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+               rp++;
+            }
+
+            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
+            {
+               int a, b, c, pa, pb, pc, p;
+
+               a = *lp++;
+               b = *pp++;
+               c = *cp++;
+
+               p = b - c;
+               pc = a - c;
+
+#ifdef PNG_USE_ABS
+               pa = abs(p);
+               pb = abs(pc);
+               pc = abs(p + pc);
+#else
+               pa = p < 0 ? -p : p;
+               pb = pc < 0 ? -pc : pc;
+               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
+#endif
+
+               /*
+                  if (pa <= pb && pa <= pc)
+                     p = a;
+                  else if (pb <= pc)
+                     p = b;
+                  else
+                     p = c;
+                */
+
+               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
+
+               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
+               rp++;
+            }
+         }  /* end !UseMMX_paeth */
+         break;
+
+      default:
+         png_warning(png_ptr, "Ignoring bad row-filter type");
+         *row=0;
+         break;
+   }
+}
+
+#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
+
+
+/*===========================================================================*/
+/*                                                                           */
+/*                      P N G _ M M X _ S U P P O R T                        */
+/*                                                                           */
+/*===========================================================================*/
+
+/* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
+ *             (2) all instructions compile with gcc 2.7.2.3 and later
+ *             (3) the function is moved down here to prevent gcc from
+ *                  inlining it in multiple places and then barfing be-
+ *                  cause the ".NOT_SUPPORTED" label is multiply defined
+ *             [is there a way to signal that a *single* function should
+ *              not be inlined?  is there a way to modify the label for
+ *              each inlined instance, e.g., by appending _1, _2, etc.?
+ *              maybe if don't use leading "." in label name? (nope...sigh)]
+ */
+
+int PNGAPI
+png_mmx_support(void)
+{
+#if defined(PNG_MMX_CODE_SUPPORTED)
+    int result;
+    __asm__ __volatile__ (
+        "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
+        "pushl %%ecx          \n\t"  // so does ecx...
+        "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
+//      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
+//      "pushf                \n\t"  // 16-bit pushf
+        "pushfl               \n\t"  // save Eflag to stack
+        "popl %%eax           \n\t"  // get Eflag from stack into eax
+        "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
+        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
+        "pushl %%eax          \n\t"  // save modified Eflag back to stack
+//      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
+//      "popf                 \n\t"  // 16-bit popf
+        "popfl                \n\t"  // restore modified value to Eflag reg
+        "pushfl               \n\t"  // save Eflag to stack
+        "popl %%eax           \n\t"  // get Eflag from stack
+        "pushl %%ecx          \n\t"  // save original Eflag to stack
+        "popfl                \n\t"  // restore original Eflag
+        "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
+        "jz 0f                \n\t"  // if same, CPUID instr. is not supported
+
+        "xorl %%eax, %%eax    \n\t"  // set eax to zero
+//      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
+        "cpuid                \n\t"  // get the CPU identification info
+        "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
+        "jl 0f                \n\t"  // if eax is zero, MMX is not supported
+
+        "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
+        "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
+                                     // faster than the instruction "mov eax, 1"
+        "cpuid                \n\t"  // get the CPU identification info again
+        "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
+        "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
+        "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
+
+        "movl $1, %%eax       \n\t"  // set return value to 1
+        "jmp  1f              \n\t"  // DONE:  have MMX support
+
+    "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
+        "movl $0, %%eax       \n\t"  // set return value to 0
+    "1:                       \n\t"  // .RETURN: target label for jump instructions
+        "popl %%edx           \n\t"  // restore edx
+        "popl %%ecx           \n\t"  // restore ecx
+        "popl %%ebx           \n\t"  // restore ebx
+
+//      "ret                  \n\t"  // DONE:  no MMX support
+                                     // (fall through to standard C "ret")
+
+        : "=a" (result)              // output list
+
+        :                            // any variables used on input (none)
+
+                                     // no clobber list
+//      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
+//      , "memory"   // if write to a variable gcc thought was in a reg
+//      , "cc"       // "condition codes" (flag bits)
+    );
+    _mmx_supported = result;
+#else
+    _mmx_supported = 0;
+#endif /* PNG_MMX_CODE_SUPPORTED */
+
+    return _mmx_supported;
+}
+
+
+#endif /* PNG_USE_PNGGCCRD */
--- a/modules/libimg/png/pngget.c
+++ b/modules/libimg/png/pngget.c
@@ -507,21 +507,18 @@ png_get_iCCP(png_structp png_ptr, png_in
 #endif
 
 #if defined(PNG_sPLT_SUPPORTED)
 png_uint_32 PNGAPI
 png_get_sPLT(png_structp png_ptr, png_infop info_ptr,
              png_sPLT_tpp spalettes)
 {
    if (png_ptr != NULL && info_ptr != NULL && spalettes != NULL)
-   {
      *spalettes = info_ptr->splt_palettes;
-     return ((png_uint_32)info_ptr->splt_palettes_num);
-   }
-   return (0);
+   return ((png_uint_32)info_ptr->splt_palettes_num);
 }
 #endif
 
 #if defined(PNG_hIST_SUPPORTED)
 png_uint_32 PNGAPI
 png_get_hIST(png_structp png_ptr, png_infop info_ptr, png_uint_16p *hist)
 {
    if (png_ptr != NULL && info_ptr != NULL && (info_ptr->valid & PNG_INFO_hIST)
@@ -947,33 +944,30 @@ png_get_next_frame_blend_op(png_structp 
 }
 
 png_byte PNGAPI
 png_get_first_frame_is_hidden(png_structp png_ptr, png_infop info_ptr)
 {
     png_debug(1, "in png_first_frame_is_hidden()\n");
     
     if (png_ptr != NULL)
-       return (png_byte)(png_ptr->apng_flags & PNG_FIRST_FRAME_HIDDEN);
+        return png_ptr->apng_flags & PNG_FIRST_FRAME_HIDDEN;
     
     return 0;
 }
 #endif /* PNG_APNG_SUPPORTED */
 
 #if defined(PNG_UNKNOWN_CHUNKS_SUPPORTED)
 png_uint_32 PNGAPI
 png_get_unknown_chunks(png_structp png_ptr, png_infop info_ptr,
              png_unknown_chunkpp unknowns)
 {
    if (png_ptr != NULL && info_ptr != NULL && unknowns != NULL)
-   {
      *unknowns = info_ptr->unknown_chunks;
-     return ((png_uint_32)info_ptr->unknown_chunks_num);
-   }
-   return (0);
+   return ((png_uint_32)info_ptr->unknown_chunks_num);
 }
 #endif
 
 #if defined(PNG_READ_RGB_TO_GRAY_SUPPORTED)
 png_byte PNGAPI
 png_get_rgb_to_gray_status (png_structp png_ptr)
 {
    return (png_byte)(png_ptr? png_ptr->rgb_to_gray_status : 0);
@@ -997,54 +991,115 @@ png_get_compression_buffer_size(png_stru
 #endif
 
 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
 #ifndef PNG_1_0_X
 /* this function was added to libpng 1.2.0 and should exist by default */
 png_uint_32 PNGAPI
 png_get_asm_flags (png_structp png_ptr)
 {
-    /* obsolete, to be removed from libpng-1.4.0 */
+#ifdef PNG_MMX_CODE_SUPPORTED
+    return (png_uint_32)(png_ptr? png_ptr->asm_flags : 0L);
+#else
     return (png_ptr? 0L: 0L);
+#endif
 }
 
 /* this function was added to libpng 1.2.0 and should exist by default */
 png_uint_32 PNGAPI
 png_get_asm_flagmask (int flag_select)
 {
-    /* obsolete, to be removed from libpng-1.4.0 */
-    flag_select=flag_select;
-    return 0L;
+#ifdef PNG_MMX_CODE_SUPPORTED
+    png_uint_32 settable_asm_flags = 0;
+
+    if (flag_select & PNG_SELECT_READ)
+        settable_asm_flags |=
+          PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  |
+          PNG_ASM_FLAG_MMX_READ_INTERLACE    |
+          PNG_ASM_FLAG_MMX_READ_FILTER_SUB   |
+          PNG_ASM_FLAG_MMX_READ_FILTER_UP    |
+          PNG_ASM_FLAG_MMX_READ_FILTER_AVG   |
+          PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
+          /* no non-MMX flags yet */
+
+#if 0
+    /* GRR:  no write-flags yet, either, but someday... */
+    if (flag_select & PNG_SELECT_WRITE)
+        settable_asm_flags |=
+          PNG_ASM_FLAG_MMX_WRITE_ [whatever] ;
+#endif /* 0 */
+
+    return settable_asm_flags;  /* _theoretically_ settable capabilities only */
+#else
+    return (0L);
+#endif /* PNG_MMX_CODE_SUPPORTED */
 }
 
+
     /* GRR:  could add this:   && defined(PNG_MMX_CODE_SUPPORTED) */
 /* this function was added to libpng 1.2.0 */
 png_uint_32 PNGAPI
 png_get_mmx_flagmask (int flag_select, int *compilerID)
 {
-    /* obsolete, to be removed from libpng-1.4.0 */
-    flag_select=flag_select;
-    *compilerID = -1;   /* unknown (i.e., no asm/MMX code compiled) */
-    return 0L;
+#if defined(PNG_MMX_CODE_SUPPORTED)
+    png_uint_32 settable_mmx_flags = 0;
+
+    if (flag_select & PNG_SELECT_READ)
+        settable_mmx_flags |=
+          PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  |
+          PNG_ASM_FLAG_MMX_READ_INTERLACE    |
+          PNG_ASM_FLAG_MMX_READ_FILTER_SUB   |
+          PNG_ASM_FLAG_MMX_READ_FILTER_UP    |
+          PNG_ASM_FLAG_MMX_READ_FILTER_AVG   |
+          PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
+#if 0
+    /* GRR:  no MMX write support yet, but someday... */
+    if (flag_select & PNG_SELECT_WRITE)
+        settable_mmx_flags |=
+          PNG_ASM_FLAG_MMX_WRITE_ [whatever] ;
+#endif /* 0 */
+
+    if (compilerID != NULL) {
+#ifdef PNG_USE_PNGVCRD
+        *compilerID = 1;    /* MSVC */
+#else
+#ifdef PNG_USE_PNGGCCRD
+        *compilerID = 2;    /* gcc/gas */
+#else
+        *compilerID = -1;   /* unknown (i.e., no asm/MMX code compiled) */
+#endif
+#endif
+    }
+
+    return settable_mmx_flags;  /* _theoretically_ settable capabilities only */
+#else
+    return (0L);
+#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 
 /* this function was added to libpng 1.2.0 */
 png_byte PNGAPI
 png_get_mmx_bitdepth_threshold (png_structp png_ptr)
 {
-    /* obsolete, to be removed from libpng-1.4.0 */
+#if defined(PNG_MMX_CODE_SUPPORTED)
+    return (png_byte)(png_ptr? png_ptr->mmx_bitdepth_threshold : 0);
+#else
     return (png_ptr? 0: 0);
+#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 
 /* this function was added to libpng 1.2.0 */
 png_uint_32 PNGAPI
 png_get_mmx_rowbytes_threshold (png_structp png_ptr)
 {
-    /* obsolete, to be removed from libpng-1.4.0 */
+#if defined(PNG_MMX_CODE_SUPPORTED)
+    return (png_uint_32)(png_ptr? png_ptr->mmx_rowbytes_threshold : 0L);
+#else
     return (png_ptr? 0L: 0L);
+#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 #endif /* ?PNG_1_0_X */
 #endif /* ?PNG_ASSEMBLER_CODE_SUPPORTED */
 
 #ifdef PNG_SET_USER_LIMITS_SUPPORTED
 /* these functions were added to libpng 1.2.6 */
 png_uint_32 PNGAPI
 png_get_user_width_max (png_structp png_ptr)
@@ -1052,11 +1107,10 @@ png_get_user_width_max (png_structp png_
     return (png_ptr? png_ptr->user_width_max : 0);
 }
 png_uint_32 PNGAPI
 png_get_user_height_max (png_structp png_ptr)
 {
     return (png_ptr? png_ptr->user_height_max : 0);
 }
 #endif /* ?PNG_SET_USER_LIMITS_SUPPORTED */
- 
 
 #endif /* PNG_READ_SUPPORTED || PNG_WRITE_SUPPORTED */
--- a/modules/libimg/png/pngpread.c
+++ b/modules/libimg/png/pngpread.c
@@ -1,14 +1,14 @@
 
 /* pngpread.c - read a png file in push mode
  *
- * Last changed in libpng 1.2.21 October 4, 2007
+ * Last changed in libpng 1.2.13 November 13, 2006
  * For conditions of distribution and use, see copyright notice in png.h
- * Copyright (c) 1998-2007 Glenn Randers-Pehrson
+ * Copyright (c) 1998-2006 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
  */
 
 #define PNG_INTERNAL
 #include "png.h"
 
 #ifdef PNG_PROGRESSIVE_READ_SUPPORTED
@@ -132,75 +132,75 @@ png_push_read_sig(png_structp png_ptr, p
       }
    }
 }
 
 void /* PRIVATE */
 png_push_read_chunk(png_structp png_ptr, png_infop info_ptr)
 {
 #ifdef PNG_USE_LOCAL_ARRAYS
-      PNG_CONST PNG_IHDR;
-      PNG_CONST PNG_IDAT;
-      PNG_CONST PNG_IEND;
-      PNG_CONST PNG_PLTE;
+      PNG_IHDR;
+      PNG_IDAT;
+      PNG_IEND;
+      PNG_PLTE;
 #if defined(PNG_READ_bKGD_SUPPORTED)
-      PNG_CONST PNG_bKGD;
+      PNG_bKGD;
 #endif
 #if defined(PNG_READ_cHRM_SUPPORTED)
-      PNG_CONST PNG_cHRM;
+      PNG_cHRM;
 #endif
 #if defined(PNG_READ_gAMA_SUPPORTED)
-      PNG_CONST PNG_gAMA;
+      PNG_gAMA;
 #endif
 #if defined(PNG_READ_hIST_SUPPORTED)
-      PNG_CONST PNG_hIST;
+      PNG_hIST;
 #endif
 #if defined(PNG_READ_iCCP_SUPPORTED)
-      PNG_CONST PNG_iCCP;
+      PNG_iCCP;
 #endif
 #if defined(PNG_READ_iTXt_SUPPORTED)
-      PNG_CONST PNG_iTXt;
+      PNG_iTXt;
 #endif
 #if defined(PNG_READ_oFFs_SUPPORTED)
-      PNG_CONST PNG_oFFs;
+      PNG_oFFs;
 #endif
 #if defined(PNG_READ_pCAL_SUPPORTED)
-      PNG_CONST PNG_pCAL;
+      PNG_pCAL;
 #endif
 #if defined(PNG_READ_pHYs_SUPPORTED)
-      PNG_CONST PNG_pHYs;
+      PNG_pHYs;
 #endif
 #if defined(PNG_READ_sBIT_SUPPORTED)
-      PNG_CONST PNG_sBIT;
+      PNG_sBIT;
 #endif
 #if defined(PNG_READ_sCAL_SUPPORTED)
-      PNG_CONST PNG_sCAL;
+      PNG_sCAL;
 #endif
 #if defined(PNG_READ_sRGB_SUPPORTED)
-      PNG_CONST PNG_sRGB;
+      PNG_sRGB;
 #endif
 #if defined(PNG_READ_sPLT_SUPPORTED)
-      PNG_CONST PNG_sPLT;
+      PNG_sPLT;
 #endif
 #if defined(PNG_READ_tEXt_SUPPORTED)
-      PNG_CONST PNG_tEXt;
+      PNG_tEXt;
 #endif
 #if defined(PNG_READ_tIME_SUPPORTED)
-      PNG_CONST PNG_tIME;
+      PNG_tIME;
 #endif
 #if defined(PNG_READ_tRNS_SUPPORTED)
-      PNG_CONST PNG_tRNS;
+      PNG_tRNS;
 #endif
 #if defined(PNG_READ_zTXt_SUPPORTED)
-      PNG_CONST PNG_zTXt;
+      PNG_zTXt;
 #endif
 #if defined(PNG_READ_APNG_SUPPORTED)
-      PNG_CONST PNG_acTL;
-      PNG_CONST PNG_fcTL;
-      PNG_CONST PNG_fdAT;
+      PNG_acTL;
+      PNG_fcTL;
+      PNG_fdAT;
 #endif
 #endif /* PNG_USE_LOCAL_ARRAYS */
    /* First we make sure we have enough data for the 4 byte chunk name
     * and the 4 byte chunk length before proceeding with decoding the
     * chunk data.  To fully decode each of these chunks, we also make
     * sure we have enough data in the buffer for the 4 byte CRC at the
     * end of every chunk (except IDAT, which is handled separately).
     */
@@ -313,17 +313,17 @@ png_push_read_chunk(png_structp png_ptr,
          png_ptr->mode &= ~PNG_HAVE_CHUNK_HEADER;
          return;
       }
       
       return;
    }
 #endif /* PNG_READ_APNG_SUPPORTED */
    
-   if (!png_memcmp(png_ptr->chunk_name, png_IDAT, 4))
+   if (!png_memcmp(png_ptr->chunk_name, (png_bytep)png_IDAT, 4))
      if(png_ptr->mode & PNG_AFTER_IDAT)
         png_ptr->mode |= PNG_HAVE_CHUNK_AFTER_IDAT;
 
    if (!png_memcmp(png_ptr->chunk_name, png_IHDR, 4))
    {
       if (png_ptr->push_length + 4 > png_ptr->buffer_size)
       {
          png_push_save_buffer(png_ptr);
@@ -370,17 +370,17 @@ png_push_read_chunk(png_structp png_ptr,
    {
       if (png_ptr->push_length + 4 > png_ptr->buffer_size)
       {
          png_push_save_buffer(png_ptr);
          return;
       }
       png_handle_PLTE(png_ptr, info_ptr, png_ptr->push_length);
    }
-   else if (!png_memcmp(png_ptr->chunk_name, png_IDAT, 4))
+   else if (!png_memcmp(png_ptr->chunk_name, (png_bytep)png_IDAT, 4))
    {
       /* If we reach an IDAT chunk, this means we have read all of the
        * header chunks, and we can start reading the image (or if this
        * is called after the image has been read - we have an error).
        */
      if (!(png_ptr->mode & PNG_HAVE_IHDR))
        png_error(png_ptr, "Missing IHDR before IDAT");
      else if (png_ptr->color_type == PNG_COLOR_TYPE_PALETTE &&
@@ -841,17 +841,17 @@ png_push_read_IDAT(png_structp png_ptr)
                                    "APNG chunks");
               png_crc_finish(png_ptr, png_ptr->push_length);
               png_ptr->mode &= ~PNG_HAVE_CHUNK_HEADER;
               return;
           }
       }
       else 
 #endif
-      if ( png_memcmp(png_ptr->chunk_name, png_IDAT, 4)
+      if ( png_memcmp(png_ptr->chunk_name, (png_bytep)png_IDAT, 4)
                 && (png_ptr->num_frames_read == 0) )
       {
          png_ptr->process_mode = PNG_READ_CHUNK_MODE;
          if (!(png_ptr->flags & PNG_FLAG_ZLIB_FINISHED))
             png_error(png_ptr, "Not enough compressed data");
 #if defined(PNG_READ_APNG_SUPPORTED)
          if (png_ptr->frame_end_fn != NULL)
             (*(png_ptr->frame_end_fn))(png_ptr, png_ptr->num_frames_read);
@@ -1167,30 +1167,35 @@ png_push_process_row(png_structp png_ptr
 
 void /* PRIVATE */
 png_read_push_finish_row(png_structp png_ptr)
 {
 #ifdef PNG_USE_LOCAL_ARRAYS
    /* arrays to facilitate easy interlacing - use pass (0 - 6) as index */
 
    /* start of interlace block */
-   PNG_CONST int FARDATA png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
+   const int FARDATA png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
 
    /* offset to next interlace block */
-   PNG_CONST int FARDATA png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
+   const int FARDATA png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
 
    /* start of interlace block in the y direction */
-   PNG_CONST int FARDATA png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
+   const int FARDATA png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
 
    /* offset to next interlace block in the y direction */
-   PNG_CONST int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
+   const int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
+
+   /* Width of interlace block.  This is not currently used - if you need
+    * it, uncomment it here and in png.h
+   const int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
+   */
 
    /* Height of interlace block.  This is not currently used - if you need
     * it, uncomment it here and in png.h
-   PNG_CONST int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
+   const int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
    */
 #endif
 
    png_ptr->row_number++;
    if (png_ptr->row_number < png_ptr->num_rows)
       return;
 
    if (png_ptr->interlaced)
@@ -1234,17 +1239,18 @@ png_read_push_finish_row(png_structp png
 #if defined(PNG_READ_tEXt_SUPPORTED)
 void /* PRIVATE */
 png_push_handle_tEXt(png_structp png_ptr, png_infop info_ptr, png_uint_32
    length)
 {
    if (!(png_ptr->mode & PNG_HAVE_IHDR) || (png_ptr->mode & PNG_HAVE_IEND))
       {
          png_error(png_ptr, "Out of place tEXt");
-         info_ptr = info_ptr; /* to quiet some compiler warnings */
+         /* to quiet some compiler warnings */
+         if(info_ptr == NULL) return;
       }
 
 #ifdef PNG_MAX_MALLOC_64K
    png_ptr->skip_length = 0;  /* This may not be necessary */
 
    if (length > (png_uint_32)65535L) /* Can't hold entire string in memory */
    {
       png_warning(png_ptr, "tEXt chunk too large to fit in memory");
@@ -1297,17 +1303,17 @@ png_push_read_tEXt(png_structp png_ptr, 
          return;
 #endif
 
       key = png_ptr->current_text;
 
       for (text = key; *text; text++)
          /* empty loop */ ;
 
-      if (text < key + png_ptr->current_text_size)
+      if (text != key + png_ptr->current_text_size)
          text++;
 
       text_ptr = (png_textp)png_malloc(png_ptr,
          (png_uint_32)png_sizeof(png_text));
       text_ptr->compression = PNG_TEXT_COMPRESSION_NONE;
       text_ptr->key = key;
 #ifdef PNG_iTXt_SUPPORTED
       text_ptr->lang = NULL;
@@ -1330,17 +1336,18 @@ png_push_read_tEXt(png_structp png_ptr, 
 #if defined(PNG_READ_zTXt_SUPPORTED)
 void /* PRIVATE */
 png_push_handle_zTXt(png_structp png_ptr, png_infop info_ptr, png_uint_32
    length)
 {
    if (!(png_ptr->mode & PNG_HAVE_IHDR) || (png_ptr->mode & PNG_HAVE_IEND))
       {
          png_error(png_ptr, "Out of place zTXt");
-         info_ptr = info_ptr; /* to quiet some compiler warnings */
+         /* to quiet some compiler warnings */
+         if(info_ptr == NULL) return;
       }
 
 #ifdef PNG_MAX_MALLOC_64K
    /* We can't handle zTXt chunks > 64K, since we don't have enough space
     * to be able to store the uncompressed data.  Actually, the threshold
     * is probably around 32K, but it isn't as definite as 64K is.
     */
    if (length > (png_uint_32)65535L)
@@ -1392,17 +1399,17 @@ png_push_read_zTXt(png_structp png_ptr, 
       png_push_crc_finish(png_ptr);
 
       key = png_ptr->current_text;
 
       for (text = key; *text; text++)
          /* empty loop */ ;
 
       /* zTXt can't have zero text */
-      if (text >= key + png_ptr->current_text_size)
+      if (text == key + png_ptr->current_text_size)
       {
          png_ptr->current_text = NULL;
          png_free(png_ptr, key);
          return;
       }
 
       text++;
 
@@ -1522,17 +1529,18 @@ png_push_read_zTXt(png_structp png_ptr, 
 #if defined(PNG_READ_iTXt_SUPPORTED)
 void /* PRIVATE */
 png_push_handle_iTXt(png_structp png_ptr, png_infop info_ptr, png_uint_32
    length)
 {
    if (!(png_ptr->mode & PNG_HAVE_IHDR) || (png_ptr->mode & PNG_HAVE_IEND))
       {
          png_error(png_ptr, "Out of place iTXt");
-         info_ptr = info_ptr; /* to quiet some compiler warnings */
+         /* to quiet some compiler warnings */
+         if(info_ptr == NULL) return;
       }
 
 #ifdef PNG_MAX_MALLOC_64K
    png_ptr->skip_length = 0;  /* This may not be necessary */
 
    if (length > (png_uint_32)65535L) /* Can't hold entire string in memory */
    {
       png_warning(png_ptr, "iTXt chunk too large to fit in memory");
@@ -1589,34 +1597,30 @@ png_push_read_iTXt(png_structp png_ptr, 
          return;
 #endif
 
       key = png_ptr->current_text;
 
       for (lang = key; *lang; lang++)
          /* empty loop */ ;
 
-      if (lang < key + png_ptr->current_text_size - 3)
+      if (lang != key + png_ptr->current_text_size)
          lang++;
 
       comp_flag = *lang++;
       lang++;     /* skip comp_type, always zero */
 
       for (lang_key = lang; *lang_key; lang_key++)
          /* empty loop */ ;
       lang_key++;        /* skip NUL separator */
 
-      text=lang_key;
-      if (lang_key < key + png_ptr->current_text_size - 1)
-      {
-        for (; *text; text++)
-           /* empty loop */ ;
-      }
+      for (text = lang_key; *text; text++)
+         /* empty loop */ ;
 
-      if (text < key + png_ptr->current_text_size)
+      if (text != key + png_ptr->current_text_size)
          text++;
 
       text_ptr = (png_textp)png_malloc(png_ptr,
          (png_uint_32)png_sizeof(png_text));
       text_ptr->compression = comp_flag + 2;
       text_ptr->key = key;
       text_ptr->lang = lang;
       text_ptr->lang_key = lang_key;
@@ -1644,68 +1648,65 @@ png_push_handle_unknown(png_structp png_
    length)
 {
    png_uint_32 skip=0;
    png_check_chunk_name(png_ptr, png_ptr->chunk_name);
 
    if (!(png_ptr->chunk_name[0] & 0x20))
    {
 #if defined(PNG_READ_UNKNOWN_CHUNKS_SUPPORTED)
-     if(png_handle_as_unknown(png_ptr, png_ptr->chunk_name) !=
-          PNG_HANDLE_CHUNK_ALWAYS
+      if(png_handle_as_unknown(png_ptr, png_ptr->chunk_name) !=
+           PNG_HANDLE_CHUNK_ALWAYS
 #if defined(PNG_READ_USER_CHUNKS_SUPPORTED)
-          && png_ptr->read_user_chunk_fn == NULL
+           && png_ptr->read_user_chunk_fn == NULL
 #endif
-        )
+         )
 #endif
-        png_chunk_error(png_ptr, "unknown critical chunk");
+         png_chunk_error(png_ptr, "unknown critical chunk");
 
-     info_ptr = info_ptr; /* to quiet some compiler warnings */
+      /* to quiet compiler warnings about unused info_ptr */
+      if (info_ptr == NULL)
+         return;
    }
 
 #if defined(PNG_READ_UNKNOWN_CHUNKS_SUPPORTED)
    if (png_ptr->flags & PNG_FLAG_KEEP_UNKNOWN_CHUNKS)
    {
+       png_unknown_chunk chunk;
+
 #ifdef PNG_MAX_MALLOC_64K
-      if (length > (png_uint_32)65535L)
-      {
-          png_warning(png_ptr, "unknown chunk too large to fit in memory");
-          skip = length - (png_uint_32)65535L;
-          length = (png_uint_32)65535L;
-      }
+       if (length > (png_uint_32)65535L)
+       {
+           png_warning(png_ptr, "unknown chunk too large to fit in memory");
+           skip = length - (png_uint_32)65535L;
+           length = (png_uint_32)65535L;
+       }
 #endif
-      png_strncpy((png_charp)png_ptr->unknown_chunk.name,
-	 (png_charp)png_ptr->chunk_name, 5);
-      png_ptr->unknown_chunk.data = (png_bytep)png_malloc(png_ptr, length);
-      png_ptr->unknown_chunk.size = (png_size_t)length;
-      png_crc_read(png_ptr, (png_bytep)png_ptr->unknown_chunk.data, length);
+
+       png_strcpy((png_charp)chunk.name, (png_charp)png_ptr->chunk_name);
+       chunk.data = (png_bytep)png_malloc(png_ptr, length);
+       png_crc_read(png_ptr, chunk.data, length);
+       chunk.size = length;
 #if defined(PNG_READ_USER_CHUNKS_SUPPORTED)
-      if(png_ptr->read_user_chunk_fn != NULL)
-      {
-         /* callback to user unknown chunk handler */
-         int ret;
-         ret = (*(png_ptr->read_user_chunk_fn))
-           (png_ptr, &png_ptr->unknown_chunk);
-         if (ret < 0)
-            png_chunk_error(png_ptr, "error in user chunk");
-         if (ret == 0)
-         {
-            if (!(png_ptr->chunk_name[0] & 0x20))
-               if(png_handle_as_unknown(png_ptr, png_ptr->chunk_name) !=
-                    PNG_HANDLE_CHUNK_ALWAYS)
-                  png_chunk_error(png_ptr, "unknown critical chunk");
-            png_set_unknown_chunks(png_ptr, info_ptr,
-               &png_ptr->unknown_chunk, 1);
-         }
-      }
-#else
-      png_set_unknown_chunks(png_ptr, info_ptr, &png_ptr->unknown_chunk, 1);
+       if(png_ptr->read_user_chunk_fn != NULL)
+       {
+          /* callback to user unknown chunk handler */
+          if ((*(png_ptr->read_user_chunk_fn)) (png_ptr, &chunk) <= 0)
+          {
+             if (!(png_ptr->chunk_name[0] & 0x20))
+                if(png_handle_as_unknown(png_ptr, png_ptr->chunk_name) !=
+                     PNG_HANDLE_CHUNK_ALWAYS)
+                   png_chunk_error(png_ptr, "unknown critical chunk");
+          }
+             png_set_unknown_chunks(png_ptr, info_ptr, &chunk, 1);
+       }
+       else
 #endif
-      png_free(png_ptr, png_ptr->unknown_chunk.data);
-      png_ptr->unknown_chunk.data = NULL;
+          png_set_unknown_chunks(png_ptr, info_ptr, &chunk, 1);
+       png_free(png_ptr, chunk.data);
    }
    else
 #endif
       skip=length;
    png_push_crc_skip(png_ptr, skip);
 }
 
 void /* PRIVATE */
@@ -1730,17 +1731,17 @@ png_push_have_row(png_structp png_ptr, p
          (int)png_ptr->pass);
 }
 
 void PNGAPI
 png_progressive_combine_row (png_structp png_ptr,
    png_bytep old_row, png_bytep new_row)
 {
 #ifdef PNG_USE_LOCAL_ARRAYS
-   PNG_CONST int FARDATA png_pass_dsp_mask[7] =
+   const int FARDATA png_pass_dsp_mask[7] =
       {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
 #endif
    if(png_ptr == NULL) return;
    if (new_row != NULL)    /* new_row must == png_ptr->row_buf here. */
       png_combine_row(png_ptr, old_row, png_pass_dsp_mask[png_ptr->pass]);
 }
 
 void PNGAPI
--- a/modules/libimg/png/pngread.c
+++ b/modules/libimg/png/pngread.c
@@ -1,12 +1,12 @@
 
 /* pngread.c - read a PNG file
  *
- * Last changed in libpng 1.2.20 September 7, 2007
+ * Last changed in libpng 1.2.15 January 5, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
  *
  * This file contains routines that an application calls directly to
  * read a PNG file or stream.
  */
@@ -50,16 +50,22 @@ png_create_read_struct_2(png_const_charp
    png_ptr = (png_structp)png_create_struct_2(PNG_STRUCT_PNG,
       (png_malloc_ptr)malloc_fn, (png_voidp)mem_ptr);
 #else
    png_ptr = (png_structp)png_create_struct(PNG_STRUCT_PNG);
 #endif
    if (png_ptr == NULL)
       return (NULL);
 
+#if !defined(PNG_1_0_X)
+#ifdef PNG_MMX_CODE_SUPPORTED
+   png_init_mmx_flags(png_ptr);   /* 1.2.0 addition */
+#endif
+#endif /* PNG_1_0_X */
+
    /* added at libpng-1.2.6 */
 #ifdef PNG_SET_USER_LIMITS_SUPPORTED
    png_ptr->user_width_max=PNG_USER_WIDTH_MAX;
    png_ptr->user_height_max=PNG_USER_HEIGHT_MAX;
 #endif
 
 #ifdef PNG_SETJMP_SUPPORTED
 #ifdef USE_FAR_KEYWORD
@@ -106,23 +112,21 @@ png_create_read_struct_2(png_const_charp
      if (user_png_ver == NULL || user_png_ver[0] != png_libpng_ver[0] ||
          (user_png_ver[0] == '1' && user_png_ver[2] != png_libpng_ver[2]) ||
          (user_png_ver[0] == '0' && user_png_ver[2] < '9'))
      {
 #if !defined(PNG_NO_STDIO) && !defined(_WIN32_WCE)
         char msg[80];
         if (user_png_ver)
         {
-          png_snprintf(msg, 80,
-             "Application was compiled with png.h from libpng-%.20s",
+          sprintf(msg, "Application was compiled with png.h from libpng-%.20s",
              user_png_ver);
           png_warning(png_ptr, msg);
         }
-        png_snprintf(msg, 80,
-             "Application  is  running with png.c from libpng-%.20s",
+        sprintf(msg, "Application  is  running with png.c from libpng-%.20s",
            png_libpng_ver);
         png_warning(png_ptr, msg);
 #endif
 #ifdef PNG_ERROR_NUMBERS_SUPPORTED
         png_ptr->flags=0;
 #endif
         png_error(png_ptr,
            "Incompatible libpng version in application and library");
@@ -188,23 +192,21 @@ png_read_init_2(png_structp png_ptr, png
 #if !defined(PNG_NO_STDIO) && !defined(_WIN32_WCE)
    if(png_sizeof(png_struct) > png_struct_size ||
       png_sizeof(png_info) > png_info_size)
    {
       char msg[80];
       png_ptr->warning_fn=NULL;
       if (user_png_ver)
       {
-        png_snprintf(msg, 80,
-           "Application was compiled with png.h from libpng-%.20s",
+        sprintf(msg, "Application was compiled with png.h from libpng-%.20s",
            user_png_ver);
         png_warning(png_ptr, msg);
       }
-      png_snprintf(msg, 80,
-         "Application  is  running with png.c from libpng-%.20s",
+      sprintf(msg, "Application  is  running with png.c from libpng-%.20s",
          png_libpng_ver);
       png_warning(png_ptr, msg);
    }
 #endif
    if(png_sizeof(png_struct) > png_struct_size)
      {
        png_ptr->error_fn=NULL;
 #ifdef PNG_ERROR_NUMBERS_SUPPORTED
@@ -339,75 +341,75 @@ png_read_info(png_structp png_ptr, png_i
       }
       if (num_checked < 3)
          png_ptr->mode |= PNG_HAVE_PNG_SIGNATURE;
    }
 
    for(;;)
    {
 #ifdef PNG_USE_LOCAL_ARRAYS
-      PNG_CONST PNG_IHDR;
-      PNG_CONST PNG_IDAT;
-      PNG_CONST PNG_IEND;
-      PNG_CONST PNG_PLTE;
+      PNG_IHDR;
+      PNG_IDAT;
+      PNG_IEND;
+      PNG_PLTE;
 #if defined(PNG_READ_bKGD_SUPPORTED)
-      PNG_CONST PNG_bKGD;
+      PNG_bKGD;
 #endif
 #if defined(PNG_READ_cHRM_SUPPORTED)
-      PNG_CONST PNG_cHRM;
+      PNG_cHRM;
 #endif
 #if defined(PNG_READ_gAMA_SUPPORTED)
-      PNG_CONST PNG_gAMA;
+      PNG_gAMA;
 #endif
 #if defined(PNG_READ_hIST_SUPPORTED)
-      PNG_CONST PNG_hIST;
+      PNG_hIST;
 #endif
 #if defined(PNG_READ_iCCP_SUPPORTED)
-      PNG_CONST PNG_iCCP;
+      PNG_iCCP;
 #endif
 #if defined(PNG_READ_iTXt_SUPPORTED)
-      PNG_CONST PNG_iTXt;
+      PNG_iTXt;
 #endif
 #if defined(PNG_READ_oFFs_SUPPORTED)
-      PNG_CONST PNG_oFFs;
+      PNG_oFFs;
 #endif
 #if defined(PNG_READ_pCAL_SUPPORTED)
-      PNG_CONST PNG_pCAL;
+      PNG_pCAL;
 #endif
 #if defined(PNG_READ_pHYs_SUPPORTED)
-      PNG_CONST PNG_pHYs;
+      PNG_pHYs;
 #endif
 #if defined(PNG_READ_sBIT_SUPPORTED)
-      PNG_CONST PNG_sBIT;
+      PNG_sBIT;
 #endif
 #if defined(PNG_READ_sCAL_SUPPORTED)
-      PNG_CONST PNG_sCAL;
+      PNG_sCAL;
 #endif
 #if defined(PNG_READ_sPLT_SUPPORTED)
-      PNG_CONST PNG_sPLT;
+      PNG_sPLT;
 #endif
 #if defined(PNG_READ_sRGB_SUPPORTED)
-      PNG_CONST PNG_sRGB;
+      PNG_sRGB;
 #endif
 #if defined(PNG_READ_tEXt_SUPPORTED)
-      PNG_CONST PNG_tEXt;
+      PNG_tEXt;
 #endif
 #if defined(PNG_READ_tIME_SUPPORTED)
-      PNG_CONST PNG_tIME;
+      PNG_tIME;
 #endif
 #if defined(PNG_READ_tRNS_SUPPORTED)
-      PNG_CONST PNG_tRNS;
+      PNG_tRNS;
 #endif
 #if defined(PNG_READ_zTXt_SUPPORTED)
-      PNG_CONST PNG_zTXt;
+      PNG_zTXt;
 #endif
 #if defined(PNG_READ_APNG_SUPPORTED)
-      PNG_CONST PNG_acTL;
-      PNG_CONST PNG_fcTL;
-      PNG_CONST PNG_fdAT;
+      PNG_acTL;
+      PNG_fcTL;
+      PNG_fdAT;
 #endif
 #endif /* PNG_USE_LOCAL_ARRAYS */
       png_byte chunk_length[4];
       png_uint_32 length;
 
       png_read_data(png_ptr, chunk_length, 4);
       length = png_get_uint_31(png_ptr,chunk_length);
 
@@ -415,17 +417,17 @@ png_read_info(png_structp png_ptr, png_i
       png_crc_read(png_ptr, png_ptr->chunk_name, 4);
 
       png_debug2(0, "Reading %s chunk, length=%lu.\n", png_ptr->chunk_name,
          length);
 
       /* This should be a binary subdivision search or a hash for
        * matching the chunk name rather than a linear search.
        */
-      if (!png_memcmp(png_ptr->chunk_name, png_IDAT, 4))
+      if (!png_memcmp(png_ptr->chunk_name, (png_bytep)png_IDAT, 4))
         if(png_ptr->mode & PNG_AFTER_IDAT)
           png_ptr->mode |= PNG_HAVE_CHUNK_AFTER_IDAT;
 
       if (!png_memcmp(png_ptr->chunk_name, png_IHDR, 4))
          png_handle_IHDR(png_ptr, info_ptr, length);
       else if (!png_memcmp(png_ptr->chunk_name, png_IEND, 4))
          png_handle_IEND(png_ptr, info_ptr, length);
 #ifdef PNG_HANDLE_AS_UNKNOWN_SUPPORTED
@@ -653,24 +655,23 @@ png_start_read_image(png_structp png_ptr
 }
 #endif /* PNG_NO_SEQUENTIAL_READ_SUPPORTED */
 
 #ifndef PNG_NO_SEQUENTIAL_READ_SUPPORTED
 void PNGAPI
 png_read_row(png_structp png_ptr, png_bytep row, png_bytep dsp_row)
 {
 #ifdef PNG_USE_LOCAL_ARRAYS
-   PNG_CONST PNG_IDAT;
+   PNG_IDAT;
 #if defined(PNG_READ_APNG_SUPPORTED)
-   PNG_CONST PNG_fdAT;
-   PNG_CONST PNG_IEND;
+   PNG_fdAT;
+   PNG_IEND;
 #endif
-   PNG_CONST int png_pass_dsp_mask[7] = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55,
-     0xff};
-   PNG_CONST int png_pass_mask[7] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
+   const int png_pass_dsp_mask[7] = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
+   const int png_pass_mask[7] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
 #endif
    int ret;
    if(png_ptr == NULL) return;
    png_debug2(1, "in png_read_row (row %lu, pass %d)\n",
       png_ptr->row_number, png_ptr->pass);
    if (!(png_ptr->flags & PNG_FLAG_ROW_INIT))
       png_read_start_row(png_ptr);
    if (png_ptr->row_number == 0 && png_ptr->pass == 0)
@@ -1046,75 +1047,75 @@ png_read_end(png_structp png_ptr, png_in
 
    png_debug(1, "in png_read_end\n");
    if(png_ptr == NULL) return;
    png_crc_finish(png_ptr, 0); /* Finish off CRC from last IDAT chunk */
 
    do
    {
 #ifdef PNG_USE_LOCAL_ARRAYS
-      PNG_CONST PNG_IHDR;
-      PNG_CONST PNG_IDAT;
-      PNG_CONST PNG_IEND;
-      PNG_CONST PNG_PLTE;
+      PNG_IHDR;
+      PNG_IDAT;
+      PNG_IEND;
+      PNG_PLTE;
 #if defined(PNG_READ_bKGD_SUPPORTED)
-      PNG_CONST PNG_bKGD;
+      PNG_bKGD;
 #endif
 #if defined(PNG_READ_cHRM_SUPPORTED)
-      PNG_CONST PNG_cHRM;
+      PNG_cHRM;
 #endif
 #if defined(PNG_READ_gAMA_SUPPORTED)
-      PNG_CONST PNG_gAMA;
+      PNG_gAMA;
 #endif
 #if defined(PNG_READ_hIST_SUPPORTED)
-      PNG_CONST PNG_hIST;
+      PNG_hIST;
 #endif
 #if defined(PNG_READ_iCCP_SUPPORTED)
-      PNG_CONST PNG_iCCP;
+      PNG_iCCP;
 #endif
 #if defined(PNG_READ_iTXt_SUPPORTED)
-      PNG_CONST PNG_iTXt;
+      PNG_iTXt;
 #endif
 #if defined(PNG_READ_oFFs_SUPPORTED)
-      PNG_CONST PNG_oFFs;
+      PNG_oFFs;
 #endif
 #if defined(PNG_READ_pCAL_SUPPORTED)
-      PNG_CONST PNG_pCAL;
+      PNG_pCAL;
 #endif
 #if defined(PNG_READ_pHYs_SUPPORTED)
-      PNG_CONST PNG_pHYs;
+      PNG_pHYs;
 #endif
 #if defined(PNG_READ_sBIT_SUPPORTED)
-      PNG_CONST PNG_sBIT;
+      PNG_sBIT;
 #endif
 #if defined(PNG_READ_sCAL_SUPPORTED)
-      PNG_CONST PNG_sCAL;
+      PNG_sCAL;
 #endif
 #if defined(PNG_READ_sPLT_SUPPORTED)
-      PNG_CONST PNG_sPLT;
+      PNG_sPLT;
 #endif
 #if defined(PNG_READ_sRGB_SUPPORTED)
-      PNG_CONST PNG_sRGB;
+      PNG_sRGB;
 #endif
 #if defined(PNG_READ_tEXt_SUPPORTED)
-      PNG_CONST PNG_tEXt;
+      PNG_tEXt;
 #endif
 #if defined(PNG_READ_tIME_SUPPORTED)
-      PNG_CONST PNG_tIME;
+      PNG_tIME;
 #endif
 #if defined(PNG_READ_tRNS_SUPPORTED)
-      PNG_CONST PNG_tRNS;
+      PNG_tRNS;
 #endif
 #if defined(PNG_READ_zTXt_SUPPORTED)
-      PNG_CONST PNG_zTXt;
+      PNG_zTXt;
 #endif
 #if defined(PNG_READ_APNG_SUPPORTED)
-      PNG_CONST PNG_acTL;
-      PNG_CONST PNG_fcTL;
-      PNG_CONST PNG_fdAT;
+      PNG_acTL;
+      PNG_fcTL;
+      PNG_fdAT;
 #endif
 #endif /* PNG_USE_LOCAL_ARRAYS */
 
       png_read_data(png_ptr, chunk_length, 4);
       length = png_get_uint_31(png_ptr,chunk_length);
 
       png_reset_crc(png_ptr);
       png_crc_read(png_ptr, png_ptr->chunk_name, 4);
@@ -1598,15 +1599,15 @@ png_read_png(png_structp png_ptr, png_in
    }
 
    png_read_image(png_ptr, info_ptr->row_pointers);
    info_ptr->valid |= PNG_INFO_IDAT;
 
    /* read rest of file, and get additional chunks in info_ptr - REQUIRED */
    png_read_end(png_ptr, info_ptr);
 
-   transforms = transforms; /* quiet compiler warnings */
-   params = params;
+   if(transforms == 0 || params == NULL)
+      /* quiet compiler warnings */ return;
 
 }
 #endif /* PNG_INFO_IMAGE_SUPPORTED */
 #endif /* PNG_NO_SEQUENTIAL_READ_SUPPORTED */
 #endif /* PNG_READ_SUPPORTED */
--- a/modules/libimg/png/pngrtran.c
+++ b/modules/libimg/png/pngrtran.c
@@ -1,12 +1,12 @@
 
 /* pngrtran.c - transforms the data in a row for PNG readers
  *
- * Last changed in libpng 1.2.21 [October 4, 2007]
+ * Last changed in libpng 1.2.15 January 5, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
  *
  * This file contains functions optionally called by an application
  * in order to tell libpng how to handle data when reading a PNG.
  * Transformations that are used in both reading and writing are
@@ -543,19 +543,16 @@ png_set_gamma(png_structp png_ptr, doubl
  * to alpha channels.
  */
 void PNGAPI
 png_set_expand(png_structp png_ptr)
 {
    png_debug(1, "in png_set_expand\n");
    if(png_ptr == NULL) return;
    png_ptr->transformations |= (PNG_EXPAND | PNG_EXPAND_tRNS);
-#ifdef PNG_WARN_UNINITIALIZED_ROW
-   png_ptr->flags &= ~PNG_FLAG_ROW_INIT;
-#endif
 }
 
 /* GRR 19990627:  the following three functions currently are identical
  *  to png_set_expand().  However, it is entirely reasonable that someone
  *  might wish to expand an indexed image to RGB but *not* expand a single,
  *  fully transparent palette entry to a full alpha channel--perhaps instead
  *  convert tRNS to the grayscale/RGB format (16-bit RGB value), or replace
  *  the transparent color with a particular RGB value, or drop tRNS entirely.
@@ -572,33 +569,26 @@ png_set_expand(png_structp png_ptr)
 
 /* Expand paletted images to RGB. */
 void PNGAPI
 png_set_palette_to_rgb(png_structp png_ptr)
 {
    png_debug(1, "in png_set_palette_to_rgb\n");
    if(png_ptr == NULL) return;
    png_ptr->transformations |= (PNG_EXPAND | PNG_EXPAND_tRNS);
-#ifdef PNG_WARN_UNINITIALIZED_ROW
-   png_ptr->flags &= !(PNG_FLAG_ROW_INIT);
-   png_ptr->flags &= ~PNG_FLAG_ROW_INIT;
-#endif
 }
 
 #if !defined(PNG_1_0_X)
 /* Expand grayscale images of less than 8-bit depth to 8 bits. */
 void PNGAPI
 png_set_expand_gray_1_2_4_to_8(png_structp png_ptr)
 {
    png_debug(1, "in png_set_expand_gray_1_2_4_to_8\n");
    if(png_ptr == NULL) return;
-   png_ptr->transformations |= PNG_EXPAND;
-#ifdef PNG_WARN_UNINITIALIZED_ROW
-   png_ptr->flags &= ~PNG_FLAG_ROW_INIT;
-#endif
+   png_ptr->transformations |= PNG_EXPAND_tRNS;
 }
 #endif
 
 #if defined(PNG_1_0_X) || defined(PNG_1_2_X)
 /* Expand grayscale images of less than 8-bit depth to 8 bits. */
 /* Deprecated as of libpng-1.2.9 */
 void PNGAPI
 png_set_gray_1_2_4_to_8(png_structp png_ptr)
@@ -609,33 +599,27 @@ png_set_gray_1_2_4_to_8(png_structp png_
 }
 #endif
 
 
 /* Expand tRNS chunks to alpha channels. */
 void PNGAPI
 png_set_tRNS_to_alpha(png_structp png_ptr)
 {
-   png_debug(1, "in png_set_tRNS_to_alpha\n");
+   png_debug(1, "in png_set_expand\n");
    png_ptr->transformations |= (PNG_EXPAND | PNG_EXPAND_tRNS);
-#ifdef PNG_WARN_UNINITIALIZED_ROW
-   png_ptr->flags &= ~PNG_FLAG_ROW_INIT;
-#endif
 }
 #endif /* defined(PNG_READ_EXPAND_SUPPORTED) */
 
 #if defined(PNG_READ_GRAY_TO_RGB_SUPPORTED)
 void PNGAPI
 png_set_gray_to_rgb(png_structp png_ptr)
 {
    png_debug(1, "in png_set_gray_to_rgb\n");
    png_ptr->transformations |= PNG_GRAY_TO_RGB;
-#ifdef PNG_WARN_UNINITIALIZED_ROW
-   png_ptr->flags &= ~PNG_FLAG_ROW_INIT;
-#endif
 }
 #endif
 
 #if defined(PNG_READ_RGB_TO_GRAY_SUPPORTED)
 #if defined(PNG_FLOATING_POINT_SUPPORTED)
 /* Convert a RGB image to a grayscale of the same width.  This allows us,
  * for example, to convert a 24 bpp RGB image into an 8 bpp grayscale image.
  */
@@ -1255,53 +1239,42 @@ defined(PNG_READ_USER_TRANSFORM_SUPPORTE
 /* Transform the row.  The order of transformations is significant,
  * and is very touchy.  If you add a transformation, take care to
  * decide how it fits in with the other transformations here.
  */
 void /* PRIVATE */
 png_do_read_transformations(png_structp png_ptr)
 {
    png_debug(1, "in png_do_read_transformations\n");
+#if !defined(PNG_USELESS_TESTS_SUPPORTED)
    if (png_ptr->row_buf == NULL)
    {
 #if !defined(PNG_NO_STDIO) && !defined(_WIN32_WCE)
       char msg[50];
 
-      png_snprintf2(msg, 50,
-         "NULL row buffer for row %ld, pass %d", png_ptr->row_number,
+      sprintf(msg, "NULL row buffer for row %ld, pass %d", png_ptr->row_number,
          png_ptr->pass);
       png_error(png_ptr, msg);
 #else
       png_error(png_ptr, "NULL row buffer");
 #endif
    }
-#ifdef PNG_WARN_UNINITIALIZED_ROW
-   if (!(png_ptr->flags & PNG_FLAG_ROW_INIT))
-      /* Application has failed to call either png_read_start_image()
-       * or png_read_update_info() after setting transforms that expand
-       * pixels.  This check added to libpng-1.2.19 */
-#if (PNG_WARN_UNINITIALIZED_ROW==1)
-      png_error(png_ptr, "Uninitialized row");
-#else
-      png_warning(png_ptr, "Uninitialized row");
-#endif
 #endif
 
 #if defined(PNG_READ_EXPAND_SUPPORTED)
    if (png_ptr->transformations & PNG_EXPAND)
    {
       if (png_ptr->row_info.color_type == PNG_COLOR_TYPE_PALETTE)
       {
          png_do_expand_palette(&(png_ptr->row_info), png_ptr->row_buf + 1,
             png_ptr->palette, png_ptr->trans, png_ptr->num_trans);
       }
       else
       {
-         if (png_ptr->num_trans &&
-             (png_ptr->transformations & PNG_EXPAND_tRNS))
+         if (png_ptr->num_trans && (png_ptr->transformations & PNG_EXPAND_tRNS))
             png_do_expand(&(png_ptr->row_info), png_ptr->row_buf + 1,
                &(png_ptr->trans_values));
          else
             png_do_expand(&(png_ptr->row_info), png_ptr->row_buf + 1,
                NULL);
       }
    }
 #endif
@@ -1315,21 +1288,19 @@ png_do_read_transformations(png_structp 
 #if defined(PNG_READ_RGB_TO_GRAY_SUPPORTED)
    if (png_ptr->transformations & PNG_RGB_TO_GRAY)
    {
       int rgb_error =
          png_do_rgb_to_gray(png_ptr, &(png_ptr->row_info), png_ptr->row_buf + 1);
       if(rgb_error)
       {
          png_ptr->rgb_to_gray_status=1;
-         if((png_ptr->transformations & PNG_RGB_TO_GRAY) == 
-             PNG_RGB_TO_GRAY_WARN)
+         if(png_ptr->transformations == PNG_RGB_TO_GRAY_WARN)
             png_warning(png_ptr, "png_do_rgb_to_gray found nongray pixel");
-         if((png_ptr->transformations & PNG_RGB_TO_GRAY) ==
-             PNG_RGB_TO_GRAY_ERR)
+         if(png_ptr->transformations == PNG_RGB_TO_GRAY_ERR)
             png_error(png_ptr, "png_do_rgb_to_gray found nongray pixel");
       }
    }
 #endif
 
 /*
 From Andreas Dilger e-mail to png-implement, 26 March 1998:
 
@@ -3723,17 +3694,17 @@ png_do_expand(png_row_infop row_info, pn
          png_uint_16 gray = (png_uint_16)(trans_value ? trans_value->gray : 0);
 
          if (row_info->bit_depth < 8)
          {
             switch (row_info->bit_depth)
             {
                case 1:
                {
-                  gray = (png_uint_16)((gray&0x01)*0xff);
+                  gray = (png_uint_16)(gray*0xff);
                   sp = row + (png_size_t)((row_width - 1) >> 3);
                   dp = row + (png_size_t)row_width - 1;
                   shift = 7 - (int)((row_width + 7) & 0x07);
                   for (i = 0; i < row_width; i++)
                   {
                      if ((*sp >> shift) & 0x01)
                         *dp = 0xff;
                      else
@@ -3747,17 +3718,17 @@ png_do_expand(png_row_infop row_info, pn
                         shift++;
 
                      dp--;
                   }
                   break;
                }
                case 2:
                {
-                  gray = (png_uint_16)((gray&0x03)*0x55);
+                  gray = (png_uint_16)(gray*0x55);
                   sp = row + (png_size_t)((row_width - 1) >> 2);
                   dp = row + (png_size_t)row_width - 1;
                   shift = (int)((3 - ((row_width + 3) & 0x03)) << 1);
                   for (i = 0; i < row_width; i++)
                   {
                      value = (*sp >> shift) & 0x03;
                      *dp = (png_byte)(value | (value << 2) | (value << 4) |
                         (value << 6));
@@ -3770,17 +3741,17 @@ png_do_expand(png_row_infop row_info, pn
                         shift += 2;
 
                      dp--;
                   }
                   break;
                }
                case 4:
                {
-                  gray = (png_uint_16)((gray&0x0f)*0x11);
+                  gray = (png_uint_16)(gray*0x11);
                   sp = row + (png_size_t)((row_width - 1) >> 1);
                   dp = row + (png_size_t)row_width - 1;
                   shift = (int)((1 - ((row_width + 1) & 0x01)) << 2);
                   for (i = 0; i < row_width; i++)
                   {
                      value = (*sp >> shift) & 0x0f;
                      *dp = (png_byte)(value | (value << 4));
                      if (shift == 4)
@@ -3800,37 +3771,35 @@ png_do_expand(png_row_infop row_info, pn
             row_info->pixel_depth = 8;
             row_info->rowbytes = row_width;
          }
 
          if (trans_value != NULL)
          {
             if (row_info->bit_depth == 8)
             {
-               gray = gray & 0xff;
                sp = row + (png_size_t)row_width - 1;
                dp = row + (png_size_t)(row_width << 1) - 1;
                for (i = 0; i < row_width; i++)
                {
                   if (*sp == gray)
                      *dp-- = 0;
                   else
                      *dp-- = 0xff;
                   *dp-- = *sp--;
                }
             }
             else if (row_info->bit_depth == 16)
             {
-               png_byte gray_high = (gray >> 8) & 0xff;
-               png_byte gray_low = gray & 0xff;
                sp = row + row_info->rowbytes - 1;
                dp = row + (row_info->rowbytes << 1) - 1;
                for (i = 0; i < row_width; i++)
                {
-                  if (*(sp-1) == gray_high && *(sp) == gray_low) 
+                  if (((png_uint_16)*(sp) |
+                     ((png_uint_16)*(sp - 1) << 8)) == gray)
                   {
                      *dp-- = 0;
                      *dp-- = 0;
                   }
                   else
                   {
                      *dp-- = 0xff;
                      *dp-- = 0xff;
@@ -3845,50 +3814,43 @@ png_do_expand(png_row_infop row_info, pn
             row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,
                row_width);
          }
       }
       else if (row_info->color_type == PNG_COLOR_TYPE_RGB && trans_value)
       {
          if (row_info->bit_depth == 8)
          {
-            png_byte red = trans_value->red & 0xff;
-            png_byte green = trans_value->green & 0xff;
-            png_byte blue = trans_value->blue & 0xff;
             sp = row + (png_size_t)row_info->rowbytes - 1;
             dp = row + (png_size_t)(row_width << 2) - 1;
             for (i = 0; i < row_width; i++)
             {
-               if (*(sp - 2) == red && *(sp - 1) == green && *(sp) == blue)
+               if (*(sp - 2) == trans_value->red &&
+                  *(sp - 1) == trans_value->green &&
+                  *(sp - 0) == trans_value->blue)
                   *dp-- = 0;
                else
                   *dp-- = 0xff;
                *dp-- = *sp--;
                *dp-- = *sp--;
                *dp-- = *sp--;
             }
          }
          else if (row_info->bit_depth == 16)
          {
-            png_byte red_high = (trans_value->red >> 8) & 0xff;
-            png_byte green_high = (trans_value->green >> 8) & 0xff;
-            png_byte blue_high = (trans_value->blue >> 8) & 0xff;
-            png_byte red_low = trans_value->red & 0xff;
-            png_byte green_low = trans_value->green & 0xff;
-            png_byte blue_low = trans_value->blue & 0xff;
             sp = row + row_info->rowbytes - 1;
             dp = row + (png_size_t)(row_width << 3) - 1;
             for (i = 0; i < row_width; i++)
             {
-               if (*(sp - 5) == red_high &&
-                  *(sp - 4) == red_low &&
-                  *(sp - 3) == green_high &&
-                  *(sp - 2) == green_low &&
-                  *(sp - 1) == blue_high &&
-                  *(sp    ) == blue_low)
+               if ((((png_uint_16)*(sp - 4) |
+                  ((png_uint_16)*(sp - 5) << 8)) == trans_value->red) &&
+                  (((png_uint_16)*(sp - 2) |
+                  ((png_uint_16)*(sp - 3) << 8)) == trans_value->green) &&
+                  (((png_uint_16)*(sp - 0) |
+                  ((png_uint_16)*(sp - 1) << 8)) == trans_value->blue))
                {
                   *dp-- = 0;
                   *dp-- = 0;
                }
                else
                {
                   *dp-- = 0xff;
                   *dp-- = 0xff;
@@ -3998,17 +3960,17 @@ png_do_dither(png_row_infop row_info, pn
          }
       }
    }
 }
 #endif
 
 #ifdef PNG_FLOATING_POINT_SUPPORTED
 #if defined(PNG_READ_GAMMA_SUPPORTED)
-static PNG_CONST int png_gamma_shift[] =
+static int png_gamma_shift[] =
    {0x10, 0x21, 0x42, 0x84, 0x110, 0x248, 0x550, 0xff0, 0x00};
 
 /* We build the 8- or 16-bit gamma tables here.  Note that for 16-bit
  * tables, we don't make a full table if we are reducing to 8-bit in
  * the future.  Note also how the gamma_16 tables are segmented so that
  * we don't need to allocate > 64K chunks for a full 16-bit table.
  */
 void /* PRIVATE */
--- a/modules/libimg/png/pngrutil.c
+++ b/modules/libimg/png/pngrutil.c
@@ -1,48 +1,44 @@
 
 /* pngrutil.c - utilities to read a PNG file
  *
- * Last changed in libpng 1.2.21 [October 4, 2007]
+ * Last changed in libpng 1.2.15 January 5, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
  *
  * This file contains routines that are only called from within
  * libpng itself during the course of reading an image.
  */
 
 #define PNG_INTERNAL
 #include "png.h"
 
 #if defined(PNG_READ_SUPPORTED)
 
-#if defined(_WIN32_WCE) && (_WIN32_WCE<0x500)
-#  define WIN32_WCE_OLD
-#endif
-
 #ifdef PNG_FLOATING_POINT_SUPPORTED
-#  if defined(WIN32_WCE_OLD)
+#  if defined(_WIN32_WCE)
 /* strtod() function is not supported on WindowsCE */
-__inline double png_strtod(png_structp png_ptr, PNG_CONST char *nptr, char **endptr)
+__inline double png_strtod(png_structp png_ptr, const char *nptr, char **endptr)
 {
    double result = 0;
    int len;
    wchar_t *str, *end;
 
    len = MultiByteToWideChar(CP_ACP, 0, nptr, -1, NULL, 0);
    str = (wchar_t *)png_malloc(png_ptr, len * sizeof(wchar_t));
    if ( NULL != str )
    {
       MultiByteToWideChar(CP_ACP, 0, nptr, -1, str, len);
       result = wcstod(str, &end);
       len = WideCharToMultiByte(CP_ACP, 0, end, -1, NULL, 0, NULL, NULL);
       *endptr = (char *)nptr + (png_strlen(nptr) - len + 1);
-      png_free(png_ptr, str);
+      png_free(str);
    }
    return result;
 }
 #  else
 #    define png_strtod(p,a,b) strtod(a,b)
 #  endif
 #endif
 
@@ -180,17 +176,17 @@ png_crc_error(png_structp png_ptr)
  * holding the original prefix part and an uncompressed version of the
  * trailing part (the malloc area passed in is freed).
  */
 png_charp /* PRIVATE */
 png_decompress_chunk(png_structp png_ptr, int comp_type,
                               png_charp chunkdata, png_size_t chunklength,
                               png_size_t prefix_size, png_size_t *newlength)
 {
-   static PNG_CONST char msg[] = "Error decoding compressed text";
+   static char msg[] = "Error decoding compressed text";
    png_charp text;
    png_size_t text_size;
 
    if (comp_type == PNG_COMPRESSION_TYPE_BASE)
    {
       int ret = Z_OK;
       png_ptr->zstream.next_in = (png_bytep)(chunkdata + prefix_size);
       png_ptr->zstream.avail_in = (uInt)(chunklength - prefix_size);
@@ -281,26 +277,23 @@ png_decompress_chunk(png_structp png_ptr
          }