security/nss/lib/freebl/mpi/mpcpucache.c
author Norisz Fay <nfay@mozilla.com>
Mon, 04 Jul 2022 00:36:16 +0300
changeset 622862 c285f5ebde4b0e01dc841bfa8a948dd81372d15b
parent 620870 8a140b71769579902156914a69ba4849c5598db3
permissions -rw-r--r--
Merge autoland to mozilla-central a=merge

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mpi.h"
#include "prtypes.h"

/*
 * This file implements a single function: s_mpi_getProcessorLineSize();
 * s_mpi_getProcessorLineSize() returns the size in bytes of the cache line
 * if a cache exists, or zero if there is no cache. If more than one
 * cache line exists, it should return the smallest line size (which is
 * usually the L1 cache).
 *
 * mp_modexp uses this information to make sure that private key information
 * isn't being leaked through the cache.
 *
 * Currently the file returns good data for most modern x86 processors, and
 * reasonable data on 64-bit ppc processors. All other processors are assumed
 * to have a cache line size of 32 bytes.
 *
 */

#if defined(i386) || defined(__i386) || defined(__X86__) || defined(_M_IX86) || defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
/* X86 processors have special instructions that tell us about the cache */
#include "string.h"

#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
#define AMD_64 1
#endif

/* Generic CPUID function */
#if defined(AMD_64)

#if defined(__GNUC__)

/*
 * Execute the CPUID instruction for leaf 'op' (GCC-style inline asm, 64-bit
 * build) and store the resulting eax/ebx/ecx/edx register values through
 * the four output pointers.
 */
void
freebl_cpuid(unsigned long op, unsigned long *eax,
             unsigned long *ebx, unsigned long *ecx,
             unsigned long *edx)
{
    /* ecx is zeroed before cpuid is issued. 'op' is supplied in the same
     * register as the first output (eax) through the "0" matching
     * constraint. */
    __asm__("xor %%ecx, %%ecx\n\t"
            "cpuid\n\t"
            : "=a"(*eax),
              "=b"(*ebx),
              "=c"(*ecx),
              "=d"(*edx)
            : "0"(op));
}

#elif defined(_MSC_VER)

#include <intrin.h>

/*
 * Execute CPUID for leaf 'op' through the __cpuid compiler intrinsic and
 * copy the four values it produces into *eax, *ebx, *ecx and *edx, in that
 * order.
 */
void
freebl_cpuid(unsigned long op, unsigned long *eax,
             unsigned long *ebx, unsigned long *ecx,
             unsigned long *edx)
{
    /* receives the intrinsic's output: [0]=eax, [1]=ebx, [2]=ecx, [3]=edx */
    int intrinsic_out[4];

    __cpuid(intrinsic_out, op);
    *eax = intrinsic_out[0];
    *ebx = intrinsic_out[1];
    *ecx = intrinsic_out[2];
    *edx = intrinsic_out[3];
}

#endif

#else /* !defined(AMD_64) */

/* x86 */

#if defined(__GNUC__)
/*
 * Execute CPUID for leaf 'op' (GCC-style inline asm, 32-bit x86 build) and
 * store the resulting eax/ebx/ecx/edx register values through the four
 * output pointers. The value cpuid leaves in ebx is handed back through edi
 * so that ebx itself holds its original value again when the asm ends.
 */
void
freebl_cpuid(unsigned long op, unsigned long *eax,
             unsigned long *ebx, unsigned long *ecx,
             unsigned long *edx)
{
    /* Some older processors don't fill the ecx register with cpuid, so clobber it
     * before calling cpuid, so that there's no risk of picking random bits that
     * erroneously indicate that absent CPU features are present.
     * Also, GCC isn't smart enough to save the ebx PIC register on its own
     * in this case, so do it by hand. Use edi to store ebx and pass the
     * value returned in ebx from cpuid through edi. */
    __asm__("xor %%ecx, %%ecx\n\t"
            "mov %%ebx,%%edi\n\t"
            "cpuid\n\t"
            "xchgl %%ebx,%%edi\n\t"
            : "=a"(*eax),
              "=D"(*ebx),
              "=c"(*ecx),
              "=d"(*edx)
            : "0"(op));
}

/*
 * try flipping a processor flag to determine CPU type
 */
static unsigned long
changeFlag(unsigned long flag)
{
    /* Reads the flags register, XORs 'flag' into it, writes it back, reads
     * it again and finally restores the original value. The return value is
     * the set of bits that differ between the second read and the original
     * flags, so zero means none of the requested bits could be toggled. */
    unsigned long changedFlags, originalFlags;
    __asm__("pushfl\n\t" /* get the flags */
            "popl %0\n\t"
            "movl %0,%1\n\t" /* save the original flags */
            "xorl %2,%0\n\t" /* flip the bit */
            "pushl %0\n\t"   /* set the flags */
            "popfl\n\t"
            "pushfl\n\t" /* get the flags again (for return) */
            "popl %0\n\t"
            "pushl %1\n\t" /* restore the original flags */
            "popfl\n\t"
            : "=r"(changedFlags),
              "=r"(originalFlags),
              "=r"(flag)
            : "2"(flag));
    return changedFlags ^ originalFlags;
}

#elif defined(_MSC_VER)

/*
 * windows versions of the above assembler
 */
/* wcpuid emits the two raw bytes 0x0f 0xa2 (the cpuid opcode) inline */
#define wcpuid __asm __emit 0fh __asm __emit 0a2h
/*
 * Execute CPUID for leaf 'op' (MSVC inline asm, 32-bit x86 build) and store
 * the resulting eax/ebx/ecx/edx register values through Reax/Rebx/Recx/Redx.
 * All general registers are saved with pushad and restored with popad, so
 * the results are first parked in locals and copied out afterwards.
 */
void
freebl_cpuid(unsigned long op, unsigned long *Reax,
             unsigned long *Rebx, unsigned long *Recx, unsigned long *Redx)
{
    unsigned long Leax, Lebx, Lecx, Ledx;
    __asm {
        pushad
        xor     ecx,ecx
        mov     eax,op
        wcpuid
        mov     Leax,eax
        mov     Lebx,ebx
        mov     Lecx,ecx
        mov     Ledx,edx
        popad
    }
    *Reax = Leax;
    *Rebx = Lebx;
    *Recx = Lecx;
    *Redx = Ledx;
}

/*
 * Try to toggle the bit(s) in 'flag' in the processor flags register.
 * Returns the bits that differ between the flags read back after the
 * attempt and the original flags (zero if nothing could be toggled). The
 * original flags, eax and ebx are restored before returning.
 */
static unsigned long
changeFlag(unsigned long flag)
{
    unsigned long changedFlags, originalFlags;
    __asm {
        push eax
        push ebx
        pushfd /* get the flags */
            pop  eax
        push eax /* save the flags on the stack */
            mov  originalFlags,eax /* save the original flags */
        mov  ebx,flag
            xor  eax,ebx /* flip the bit */
        push eax /* set the flags */
            popfd
        pushfd /* get the flags again (for return) */
        pop  eax
        popfd /* restore the original flags */
        mov changedFlags,eax
        pop ebx
        pop eax
    }
    return changedFlags ^ originalFlags;
}
#endif

#endif

#if !defined(AMD_64)
#define AC_FLAG 0x40000
#define ID_FLAG 0x200000

/* 386 processors can't flip the AC_FLAG, intel AP Note AP-485 */
/*
 * Returns non-zero when the AC flag cannot be toggled, which this file
 * treats as the signature of a 386-class processor.
 */
static int
is386(void)
{
    /* changeFlag() yields 0 when the requested bit did not change */
    return changeFlag(AC_FLAG) == 0;
}

/* 486 processors can't flip the ID_FLAG, intel AP Note AP-485 */
/*
 * Returns non-zero when the ID flag cannot be toggled, which this file
 * treats as the signature of a 486-class processor.
 */
static int
is486(void)
{
    /* changeFlag() yields 0 when the requested bit did not change */
    return changeFlag(ID_FLAG) == 0;
}
#endif

/*
 * table for Intel Cache.
 * See Intel Application Note AP-485 for more information
 */

/* One-byte storage type used for the 'type' field of struct _cache; it
 * holds a CacheType value. */
typedef unsigned char CacheTypeEntry;

/*
 * Classification assigned to each cpuid(2) descriptor byte in CacheMap.
 * getIntelCacheEntryLineSize() only acts on the L1/L1d, L2/L2d and L3/L3d
 * kinds; every other kind is skipped there.
 */
typedef enum {
    Cache_NONE = 0,
    Cache_UNKNOWN = 1, /* descriptor not described by the table */
    Cache_TLB = 2,
    Cache_TLBi = 3,
    Cache_TLBd = 4,
    Cache_Trace = 5,
    Cache_L1 = 6,
    Cache_L1i = 7,
    Cache_L1d = 8,
    Cache_L2 = 9,
    Cache_L2i = 10,
    Cache_L2d = 11,
    Cache_L3 = 12,
    Cache_L3i = 13,
    Cache_L3d = 14
} CacheType;

/* One CacheMap row: what a descriptor byte denotes and its line size. */
struct _cache {
    CacheTypeEntry type;    /* a CacheType value */
    unsigned char lineSize; /* line size; 0 makes the lookup skip the entry */
};
/*
 * Lookup table indexed by a cpuid(2) descriptor byte (0x00 - 0xff). Each row
 * gives the kind of cache/TLB the descriptor denotes and its line size.
 * A lineSize of 0 means the row carries no usable line size and is skipped
 * by getIntelCacheEntryLineSize(); every row spells the field out explicitly.
 */
static const struct _cache CacheMap[256] = {
    /* 00 */ { Cache_NONE, 0 },
    /* 01 */ { Cache_TLBi, 0 },
    /* 02 */ { Cache_TLBi, 0 },
    /* 03 */ { Cache_TLBd, 0 },
    /* 04 */ { Cache_TLBd, 0 },
    /* 05 */ { Cache_UNKNOWN, 0 },
    /* 06 */ { Cache_L1i, 32 },
    /* 07 */ { Cache_UNKNOWN, 0 },
    /* 08 */ { Cache_L1i, 32 },
    /* 09 */ { Cache_UNKNOWN, 0 },
    /* 0a */ { Cache_L1d, 32 },
    /* 0b */ { Cache_UNKNOWN, 0 },
    /* 0c */ { Cache_L1d, 32 },
    /* 0d */ { Cache_UNKNOWN, 0 },
    /* 0e */ { Cache_UNKNOWN, 0 },
    /* 0f */ { Cache_UNKNOWN, 0 },
    /* 10 */ { Cache_UNKNOWN, 0 },
    /* 11 */ { Cache_UNKNOWN, 0 },
    /* 12 */ { Cache_UNKNOWN, 0 },
    /* 13 */ { Cache_UNKNOWN, 0 },
    /* 14 */ { Cache_UNKNOWN, 0 },
    /* 15 */ { Cache_UNKNOWN, 0 },
    /* 16 */ { Cache_UNKNOWN, 0 },
    /* 17 */ { Cache_UNKNOWN, 0 },
    /* 18 */ { Cache_UNKNOWN, 0 },
    /* 19 */ { Cache_UNKNOWN, 0 },
    /* 1a */ { Cache_UNKNOWN, 0 },
    /* 1b */ { Cache_UNKNOWN, 0 },
    /* 1c */ { Cache_UNKNOWN, 0 },
    /* 1d */ { Cache_UNKNOWN, 0 },
    /* 1e */ { Cache_UNKNOWN, 0 },
    /* 1f */ { Cache_UNKNOWN, 0 },
    /* 20 */ { Cache_UNKNOWN, 0 },
    /* 21 */ { Cache_UNKNOWN, 0 },
    /* 22 */ { Cache_L3, 64 },
    /* 23 */ { Cache_L3, 64 },
    /* 24 */ { Cache_UNKNOWN, 0 },
    /* 25 */ { Cache_L3, 64 },
    /* 26 */ { Cache_UNKNOWN, 0 },
    /* 27 */ { Cache_UNKNOWN, 0 },
    /* 28 */ { Cache_UNKNOWN, 0 },
    /* 29 */ { Cache_L3, 64 },
    /* 2a */ { Cache_UNKNOWN, 0 },
    /* 2b */ { Cache_UNKNOWN, 0 },
    /* 2c */ { Cache_L1d, 64 },
    /* 2d */ { Cache_UNKNOWN, 0 },
    /* 2e */ { Cache_UNKNOWN, 0 },
    /* 2f */ { Cache_UNKNOWN, 0 },
    /* 30 */ { Cache_L1i, 64 },
    /* 31 */ { Cache_UNKNOWN, 0 },
    /* 32 */ { Cache_UNKNOWN, 0 },
    /* 33 */ { Cache_UNKNOWN, 0 },
    /* 34 */ { Cache_UNKNOWN, 0 },
    /* 35 */ { Cache_UNKNOWN, 0 },
    /* 36 */ { Cache_UNKNOWN, 0 },
    /* 37 */ { Cache_UNKNOWN, 0 },
    /* 38 */ { Cache_UNKNOWN, 0 },
    /* 39 */ { Cache_L2, 64 },
    /* 3a */ { Cache_UNKNOWN, 0 },
    /* 3b */ { Cache_L2, 64 },
    /* 3c */ { Cache_L2, 64 },
    /* 3d */ { Cache_UNKNOWN, 0 },
    /* 3e */ { Cache_UNKNOWN, 0 },
    /* 3f */ { Cache_UNKNOWN, 0 },
    /* 40 */ { Cache_L2, 0 },
    /* 41 */ { Cache_L2, 32 },
    /* 42 */ { Cache_L2, 32 },
    /* 43 */ { Cache_L2, 32 },
    /* 44 */ { Cache_L2, 32 },
    /* 45 */ { Cache_L2, 32 },
    /* 46 */ { Cache_UNKNOWN, 0 },
    /* 47 */ { Cache_UNKNOWN, 0 },
    /* 48 */ { Cache_UNKNOWN, 0 },
    /* 49 */ { Cache_UNKNOWN, 0 },
    /* 4a */ { Cache_UNKNOWN, 0 },
    /* 4b */ { Cache_UNKNOWN, 0 },
    /* 4c */ { Cache_UNKNOWN, 0 },
    /* 4d */ { Cache_UNKNOWN, 0 },
    /* 4e */ { Cache_UNKNOWN, 0 },
    /* 4f */ { Cache_UNKNOWN, 0 },
    /* 50 */ { Cache_TLBi, 0 },
    /* 51 */ { Cache_TLBi, 0 },
    /* 52 */ { Cache_TLBi, 0 },
    /* 53 */ { Cache_UNKNOWN, 0 },
    /* 54 */ { Cache_UNKNOWN, 0 },
    /* 55 */ { Cache_UNKNOWN, 0 },
    /* 56 */ { Cache_UNKNOWN, 0 },
    /* 57 */ { Cache_UNKNOWN, 0 },
    /* 58 */ { Cache_UNKNOWN, 0 },
    /* 59 */ { Cache_UNKNOWN, 0 },
    /* 5a */ { Cache_UNKNOWN, 0 },
    /* 5b */ { Cache_TLBd, 0 },
    /* 5c */ { Cache_TLBd, 0 },
    /* 5d */ { Cache_TLBd, 0 },
    /* 5e */ { Cache_UNKNOWN, 0 },
    /* 5f */ { Cache_UNKNOWN, 0 },
    /* 60 */ { Cache_UNKNOWN, 0 },
    /* 61 */ { Cache_UNKNOWN, 0 },
    /* 62 */ { Cache_UNKNOWN, 0 },
    /* 63 */ { Cache_UNKNOWN, 0 },
    /* 64 */ { Cache_UNKNOWN, 0 },
    /* 65 */ { Cache_UNKNOWN, 0 },
    /* 66 */ { Cache_L1d, 64 },
    /* 67 */ { Cache_L1d, 64 },
    /* 68 */ { Cache_L1d, 64 },
    /* 69 */ { Cache_UNKNOWN, 0 },
    /* 6a */ { Cache_UNKNOWN, 0 },
    /* 6b */ { Cache_UNKNOWN, 0 },
    /* 6c */ { Cache_UNKNOWN, 0 },
    /* 6d */ { Cache_UNKNOWN, 0 },
    /* 6e */ { Cache_UNKNOWN, 0 },
    /* 6f */ { Cache_UNKNOWN, 0 },
    /* 70 */ { Cache_Trace, 1 },
    /* 71 */ { Cache_Trace, 1 },
    /* 72 */ { Cache_Trace, 1 },
    /* 73 */ { Cache_UNKNOWN, 0 },
    /* 74 */ { Cache_UNKNOWN, 0 },
    /* 75 */ { Cache_UNKNOWN, 0 },
    /* 76 */ { Cache_UNKNOWN, 0 },
    /* 77 */ { Cache_UNKNOWN, 0 },
    /* 78 */ { Cache_UNKNOWN, 0 },
    /* 79 */ { Cache_L2, 64 },
    /* 7a */ { Cache_L2, 64 },
    /* 7b */ { Cache_L2, 64 },
    /* 7c */ { Cache_L2, 64 },
    /* 7d */ { Cache_UNKNOWN, 0 },
    /* 7e */ { Cache_UNKNOWN, 0 },
    /* 7f */ { Cache_UNKNOWN, 0 },
    /* 80 */ { Cache_UNKNOWN, 0 },
    /* 81 */ { Cache_UNKNOWN, 0 },
    /* 82 */ { Cache_L2, 32 },
    /* 83 */ { Cache_L2, 32 },
    /* 84 */ { Cache_L2, 32 },
    /* 85 */ { Cache_L2, 32 },
    /* 86 */ { Cache_L2, 64 },
    /* 87 */ { Cache_L2, 64 },
    /* 88 */ { Cache_UNKNOWN, 0 },
    /* 89 */ { Cache_UNKNOWN, 0 },
    /* 8a */ { Cache_UNKNOWN, 0 },
    /* 8b */ { Cache_UNKNOWN, 0 },
    /* 8c */ { Cache_UNKNOWN, 0 },
    /* 8d */ { Cache_UNKNOWN, 0 },
    /* 8e */ { Cache_UNKNOWN, 0 },
    /* 8f */ { Cache_UNKNOWN, 0 },
    /* 90 */ { Cache_UNKNOWN, 0 },
    /* 91 */ { Cache_UNKNOWN, 0 },
    /* 92 */ { Cache_UNKNOWN, 0 },
    /* 93 */ { Cache_UNKNOWN, 0 },
    /* 94 */ { Cache_UNKNOWN, 0 },
    /* 95 */ { Cache_UNKNOWN, 0 },
    /* 96 */ { Cache_UNKNOWN, 0 },
    /* 97 */ { Cache_UNKNOWN, 0 },
    /* 98 */ { Cache_UNKNOWN, 0 },
    /* 99 */ { Cache_UNKNOWN, 0 },
    /* 9a */ { Cache_UNKNOWN, 0 },
    /* 9b */ { Cache_UNKNOWN, 0 },
    /* 9c */ { Cache_UNKNOWN, 0 },
    /* 9d */ { Cache_UNKNOWN, 0 },
    /* 9e */ { Cache_UNKNOWN, 0 },
    /* 9f */ { Cache_UNKNOWN, 0 },
    /* a0 */ { Cache_UNKNOWN, 0 },
    /* a1 */ { Cache_UNKNOWN, 0 },
    /* a2 */ { Cache_UNKNOWN, 0 },
    /* a3 */ { Cache_UNKNOWN, 0 },
    /* a4 */ { Cache_UNKNOWN, 0 },
    /* a5 */ { Cache_UNKNOWN, 0 },
    /* a6 */ { Cache_UNKNOWN, 0 },
    /* a7 */ { Cache_UNKNOWN, 0 },
    /* a8 */ { Cache_UNKNOWN, 0 },
    /* a9 */ { Cache_UNKNOWN, 0 },
    /* aa */ { Cache_UNKNOWN, 0 },
    /* ab */ { Cache_UNKNOWN, 0 },
    /* ac */ { Cache_UNKNOWN, 0 },
    /* ad */ { Cache_UNKNOWN, 0 },
    /* ae */ { Cache_UNKNOWN, 0 },
    /* af */ { Cache_UNKNOWN, 0 },
    /* b0 */ { Cache_TLBi, 0 },
    /* b1 */ { Cache_UNKNOWN, 0 },
    /* b2 */ { Cache_UNKNOWN, 0 },
    /* b3 */ { Cache_TLBd, 0 },
    /* b4 */ { Cache_UNKNOWN, 0 },
    /* b5 */ { Cache_UNKNOWN, 0 },
    /* b6 */ { Cache_UNKNOWN, 0 },
    /* b7 */ { Cache_UNKNOWN, 0 },
    /* b8 */ { Cache_UNKNOWN, 0 },
    /* b9 */ { Cache_UNKNOWN, 0 },
    /* ba */ { Cache_UNKNOWN, 0 },
    /* bb */ { Cache_UNKNOWN, 0 },
    /* bc */ { Cache_UNKNOWN, 0 },
    /* bd */ { Cache_UNKNOWN, 0 },
    /* be */ { Cache_UNKNOWN, 0 },
    /* bf */ { Cache_UNKNOWN, 0 },
    /* c0 */ { Cache_UNKNOWN, 0 },
    /* c1 */ { Cache_UNKNOWN, 0 },
    /* c2 */ { Cache_UNKNOWN, 0 },
    /* c3 */ { Cache_UNKNOWN, 0 },
    /* c4 */ { Cache_UNKNOWN, 0 },
    /* c5 */ { Cache_UNKNOWN, 0 },
    /* c6 */ { Cache_UNKNOWN, 0 },
    /* c7 */ { Cache_UNKNOWN, 0 },
    /* c8 */ { Cache_UNKNOWN, 0 },
    /* c9 */ { Cache_UNKNOWN, 0 },
    /* ca */ { Cache_UNKNOWN, 0 },
    /* cb */ { Cache_UNKNOWN, 0 },
    /* cc */ { Cache_UNKNOWN, 0 },
    /* cd */ { Cache_UNKNOWN, 0 },
    /* ce */ { Cache_UNKNOWN, 0 },
    /* cf */ { Cache_UNKNOWN, 0 },
    /* d0 */ { Cache_UNKNOWN, 0 },
    /* d1 */ { Cache_UNKNOWN, 0 },
    /* d2 */ { Cache_UNKNOWN, 0 },
    /* d3 */ { Cache_UNKNOWN, 0 },
    /* d4 */ { Cache_UNKNOWN, 0 },
    /* d5 */ { Cache_UNKNOWN, 0 },
    /* d6 */ { Cache_UNKNOWN, 0 },
    /* d7 */ { Cache_UNKNOWN, 0 },
    /* d8 */ { Cache_UNKNOWN, 0 },
    /* d9 */ { Cache_UNKNOWN, 0 },
    /* da */ { Cache_UNKNOWN, 0 },
    /* db */ { Cache_UNKNOWN, 0 },
    /* dc */ { Cache_UNKNOWN, 0 },
    /* dd */ { Cache_UNKNOWN, 0 },
    /* de */ { Cache_UNKNOWN, 0 },
    /* df */ { Cache_UNKNOWN, 0 },
    /* e0 */ { Cache_UNKNOWN, 0 },
    /* e1 */ { Cache_UNKNOWN, 0 },
    /* e2 */ { Cache_UNKNOWN, 0 },
    /* e3 */ { Cache_UNKNOWN, 0 },
    /* e4 */ { Cache_UNKNOWN, 0 },
    /* e5 */ { Cache_UNKNOWN, 0 },
    /* e6 */ { Cache_UNKNOWN, 0 },
    /* e7 */ { Cache_UNKNOWN, 0 },
    /* e8 */ { Cache_UNKNOWN, 0 },
    /* e9 */ { Cache_UNKNOWN, 0 },
    /* ea */ { Cache_UNKNOWN, 0 },
    /* eb */ { Cache_UNKNOWN, 0 },
    /* ec */ { Cache_UNKNOWN, 0 },
    /* ed */ { Cache_UNKNOWN, 0 },
    /* ee */ { Cache_UNKNOWN, 0 },
    /* ef */ { Cache_UNKNOWN, 0 },
    /* f0 */ { Cache_UNKNOWN, 0 },
    /* f1 */ { Cache_UNKNOWN, 0 },
    /* f2 */ { Cache_UNKNOWN, 0 },
    /* f3 */ { Cache_UNKNOWN, 0 },
    /* f4 */ { Cache_UNKNOWN, 0 },
    /* f5 */ { Cache_UNKNOWN, 0 },
    /* f6 */ { Cache_UNKNOWN, 0 },
    /* f7 */ { Cache_UNKNOWN, 0 },
    /* f8 */ { Cache_UNKNOWN, 0 },
    /* f9 */ { Cache_UNKNOWN, 0 },
    /* fa */ { Cache_UNKNOWN, 0 },
    /* fb */ { Cache_UNKNOWN, 0 },
    /* fc */ { Cache_UNKNOWN, 0 },
    /* fd */ { Cache_UNKNOWN, 0 },
    /* fe */ { Cache_UNKNOWN, 0 },
    /* ff */ { Cache_UNKNOWN, 0 }
};

/*
 * use the above table to determine the CacheEntryLineSize.
 */
static void
getIntelCacheEntryLineSize(unsigned long val, int *level,
                           unsigned long *lineSize)
{
    CacheType type;

    type = CacheMap[val].type;
    /* only interested in data caches */
    /* NOTE val = 0x40 is a special value that means no L2 or L3 cache.
     * this data check has the side effect of rejecting that entry. If
     * that wasn't the case, we could have to reject it explicitly */
    if (CacheMap[val].lineSize == 0) {
        return;
    }
    /* look at the caches, skip types we aren't interested in.
     * if we already have a value for a lower level cache, skip the
     * current entry */
    if ((type == Cache_L1) || (type == Cache_L1d)) {
        *level = 1;
        *lineSize = CacheMap[val].lineSize;
    } else if ((*level >= 2) && ((type == Cache_L2) || (type == Cache_L2d))) {
        *level = 2;
        *lineSize = CacheMap[val].lineSize;
    } else if ((*level >= 3) && ((type == Cache_L3) || (type == Cache_L3d))) {
        *level = 3;
        *lineSize = CacheMap[val].lineSize;
    }
    return;
}

/*
 * Split a 32-bit register value returned by cpuid(2) into its four
 * descriptor bytes and feed each one to getIntelCacheEntryLineSize(),
 * most significant byte first.
 */
static void
getIntelRegisterCacheLineSize(unsigned long val,
                              int *level, unsigned long *lineSize)
{
    int shift;

    for (shift = 24; shift >= 0; shift -= 8) {
        getIntelCacheEntryLineSize((val >> shift) & 0xff, level, lineSize);
    }
}

/*
 * returns '0' if no recognized cache is found, or if the cache
 * information is not supported by this processor
 */
static unsigned long
getIntelCacheLineSize(int cpuidLevel)
{
    unsigned long regs[4]; /* eax, ebx, ecx, edx as returned by cpuid */
    unsigned long bestLineSize = 0;
    int bestLevel = 4; /* above any real level, so the first hit is taken */
    int remaining;
    int r;

    /* cpuid leaf 2 does not exist below this level */
    if (cpuidLevel < 2) {
        return 0;
    }

    /* cpuid leaf 2 is Intel's cache-descriptor query. Every byte of the
     * four result registers may hold a descriptor that CacheMap translates.
     * The low bits of eax say how many times leaf 2 has to be issued to
     * collect every descriptor, and a register only holds valid descriptors
     * when its top bit is clear. Each valid register is handed to
     * getIntelRegisterCacheLineSize(), which keeps the line size of the
     * lowest-level data cache seen so far. */
    freebl_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
    remaining = regs[0] & 0xf;
    while (remaining > 0) {
        /* the low byte of eax is the call count, not a descriptor */
        regs[0] &= 0xffffff00;
        for (r = 0; r < 4; r++) {
            if ((regs[r] & 0x80000000) == 0) {
                getIntelRegisterCacheLineSize(regs[r], &bestLevel,
                                              &bestLineSize);
            }
        }
        remaining--;
        /* fetch the next batch only if another round is still due */
        if (remaining != 0) {
            freebl_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
        }
    }
    return bestLineSize;
}

/*
 * returns '0' if the cache info is not supported by this processor.
 * This is based on the AMD extended cache commands for cpuid.
 * (see "AMD Processor Recognition Application Note" Publication 20734).
 * Some other processors use the identical scheme.
 * (see "Processor Recognition, Transmeta Corporation").
 */
static unsigned long
getOtherCacheLineSize(unsigned long cpuidLevel)
{
    unsigned long a, b, c, d;

    /* The level passed in is discarded: what matters here is the highest
     * extended cpuid leaf, which leaf 0x80000000 reports in eax. */
    freebl_cpuid(0x80000000, &a, &b, &c, &d);
    cpuidLevel = a;

    /* no extended cache leaf, so nothing to report */
    if (cpuidLevel < 0x80000005) {
        return 0;
    }

    freebl_cpuid(0x80000005, &a, &b, &c, &d);
    /* the low byte of ecx holds the L1 data cache line size */
    return c & 0xff;
}

/*
 * Known cpuid vendor-id strings. Each macro defined next to a string is the
 * index of that string in manMap, so manMap[NAME] is always the string that
 * follows "#define NAME". MAN_UNKNOWN is one past the last valid index and
 * therefore never collides with a real manufacturer.
 */
static const char *const manMap[] = {
#define INTEL 0
    "GenuineIntel",
#define AMD 1
    "AuthenticAMD",
#define CYRIX 2
    "CyrixInstead",
#define CENTAUR 3
    "CentaurHauls",
#define NEXGEN 4
    "NexGenDriven",
#define TRANSMETA 5
    "GenuineTMx86",
#define RISE 6
    "RiseRiseRise",
#define UMC 7
    "UMC UMC UMC ",
#define SIS 8
    "Sis Sis Sis ",
#define NATIONAL 9
    "Geode by NSC",
};

/* number of entries in manMap */
static const int n_manufacturers = sizeof(manMap) / sizeof(manMap[0]);

/* sentinel for "vendor string not found in manMap" (== n_manufacturers) */
#define MAN_UNKNOWN 10

#if !defined(AMD_64)
#define SSE2_FLAG (1 << 26)
/*
 * Report whether the processor advertises SSE2.
 * Returns 1 when the SSE2 feature bit is set in edx of cpuid leaf 1, and 0
 * otherwise (including on 386/486-class parts and when cpuid reports no
 * leaf beyond 0).
 */
unsigned long
s_mpi_is_sse2()
{
    unsigned long a, b, c, d;

    /* 386/486-class parts are ruled out before any cpuid is issued */
    if (is386() || is486()) {
        return 0;
    }

    /* leaf 0 returns the highest supported leaf in eax; leaf 1 is needed */
    freebl_cpuid(0, &a, &b, &c, &d);
    if (a == 0) {
        return 0;
    }

    freebl_cpuid(1, &a, &b, &c, &d);
    return (d & SSE2_FLAG) != 0;
}
#endif

/*
 * Return the processor's data cache line size in bytes.
 *
 * On 32-bit builds a 386-class part yields 0 and a 486-class part yields 32
 * without cpuid being used. Otherwise the vendor string from cpuid leaf 0
 * selects the query method: Intel parts use the leaf-2 descriptor table,
 * everything else uses the extended leaf 0x80000005. When neither method
 * produces a value, 32 is returned.
 */
unsigned long
s_mpi_getProcessorLineSize()
{
    unsigned long eax, ebx, ecx, edx;
    PRUint32 cpuid[3];
    unsigned long cpuidLevel;
    unsigned long cacheLineSize = 0;
    int manufacturer = MAN_UNKNOWN;
    int i;
    char string[13];

#if !defined(AMD_64)
    if (is386()) {
        return 0; /* 386 had no cache */
    }
    if (is486()) {
        return 32; /* really? need more info */
    }
#endif

    /* Pentium, cpuid command is available */
    freebl_cpuid(0, &eax, &ebx, &ecx, &edx);
    cpuidLevel = eax;
    /* The manufacturer ID is a twelve character ASCII string that cpuid
     * leaf 0 returns in ebx, edx, ecx - in that order. The registers are
     * staged in a PRUint32 array and copied with memcpy so that 'string'
     * is never accessed through an integer pointer.
     */
    cpuid[0] = ebx;
    cpuid[1] = edx;
    cpuid[2] = ecx;
    memcpy(string, cpuid, sizeof(cpuid));
    string[12] = 0;

    /* look the vendor up; manufacturer stays MAN_UNKNOWN when not found */
    for (i = 0; i < n_manufacturers; i++) {
        if (strcmp(manMap[i], string) == 0) {
            manufacturer = i;
            break;
        }
    }

    if (manufacturer == INTEL) {
        cacheLineSize = getIntelCacheLineSize(cpuidLevel);
    } else {
        cacheLineSize = getOtherCacheLineSize(cpuidLevel);
    }
    /* doesn't support cache info based on cpuid. This means
     * an old pentium class processor, which have cache lines of
     * 32. If we learn differently, we can use a switch based on
     * the Manufacturer id  */
    if (cacheLineSize == 0) {
        cacheLineSize = 32;
    }
    return cacheLineSize;
}
#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
#endif

#if defined(__ppc64__)
/*
 *  Sigh, The PPC has some really nice features to help us determine cache
 *  size, since it had lots of direct control functions to do so. The POWER
 *  processor even has an instruction to do this, but it was dropped in
 *  PowerPC. Unfortunately most of them are not available in user mode.
 *
 *  The dcbz function would be a great way to determine cache line size except
 *  1) it only works on write-back memory (it throws an exception otherwise),
 *  and 2) because so many mac programs 'knew' the processor cache size was
 *  32 bytes, they used this instruction as a fast 'zero 32 bytes'. Now the new
 *  G5 processor has 128 byte cache, but dcbz only clears 32 bytes to keep
 *  these programs happy. dcbzl work if 64 bit instructions are supported.
 *  If you know 64 bit instructions are supported, and that stack is
 *  write-back, you can use this code.
 */
#include "memory.h"

/* clear the cache line that contains 'array' */
static inline void
dcbzl(char *array)
{
    /* Issues a single dcbzl instruction with 'array' and the constant 0 as
     * its two operands. The "memory" clobber tells the compiler that the
     * instruction modifies memory. */
    __asm__("dcbzl %0, %1"
            : /*no result*/
            : "b%"(array), "r"(0)
            : "memory");
}

#define PPC_DO_ALIGN(x, y) ((char *)((((long long)(x)) + ((y)-1)) & ~((y)-1)))

#define PPC_MAX_LINE_SIZE 256
/*
 * Measure the cache block size by filling an aligned buffer with 0xff,
 * zeroing one cache block with dcbzl and checking how far the zeros reach.
 * Returns the block size in bytes (a power of two up to PPC_MAX_LINE_SIZE),
 * or 0 when no zeroed byte is found.
 */
unsigned long
s_mpi_getProcessorLineSize()
{
    char scratch[2 * PPC_MAX_LINE_SIZE + 1];
    /* start on a PPC_MAX_LINE_SIZE boundary so that the cleared block is
     * known to begin at the first byte examined */
    char *aligned = PPC_DO_ALIGN(scratch, PPC_MAX_LINE_SIZE);
    int span = PPC_MAX_LINE_SIZE;

    /* every byte non-zero, then zero the one block that starts at 'aligned' */
    memset(aligned, 0xff, PPC_MAX_LINE_SIZE);
    dcbzl(aligned);

    /* probe the last byte of each candidate size, largest first; the first
     * one found cleared gives the block size */
    while (span != 0) {
        if (aligned[span - 1] == 0) {
            return span;
        }
        span = span / 2;
    }
    return 0;
}

#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
#endif

/*
 * put other processor and platform specific cache code here
 * return the smallest cache line size in bytes on the processor
 * (usually the L1 cache). If the OS has a call, this would be
 * a great place to put it.
 *
 * If there is no cache, return 0;
 *
 * define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED so the generic functions
 * below aren't compiled.
 *
 */

/* If no way to get the processor cache line size has been defined, assume
 * it's 32 bytes (most common value, does not significantly impact performance)
 */
#ifndef MPI_GET_PROCESSOR_LINE_SIZE_DEFINED
/* Fallback used when no platform-specific probe was compiled in: always
 * reports a 32-byte cache line. */
unsigned long
s_mpi_getProcessorLineSize()
{
    const unsigned long assumedLineSize = 32;

    return assumedLineSize;
}
#endif