/*==========================================================================;
 *
 *  Copyright (C) Microsoft Corporation.  All Rights Reserved.
 *
 *  File:       d3d9gpu.h
 *  Content:    Direct3D hardware register definitions include file
 *
 ****************************************************************************/

#ifndef _D3D9GPU_H_
#define _D3D9GPU_H_

#ifdef __cplusplus
extern "C" {
#endif

#pragma warning(push)

// Disable nameless struct/union and zero-sized array warnings for this header.
// The warning state saved by the push above is presumably restored by a
// matching '#pragma warning(pop)' later in this header -- TODO confirm.

#pragma warning(disable:4201 4200)

// The PPC back-end of the C compiler by default defines bitfields to be
// ordered from the MSB to the LSB, which is opposite the convention on
// the X86 platform.  Use the 'bitfield_order' pragma to switch the
// ordering.  Note that this does not affect endianness in any way.
// The override is only applied when _M_PPCBE is defined; the pushed
// ordering is presumably popped later in this header -- TODO confirm.

#if defined(_M_PPCBE)
#pragma bitfield_order(push)
#pragma bitfield_order(lsb_to_msb)
#endif

// Total amount of EDRAM memory, in bytes (10 MB):

#define GPU_EDRAM_SIZE                              (10*1024*1024)

// Total amount of system memory addressable by GPU, in bytes (512 MB):

#define GPU_MEMORY_SIZE                             (512*1024*1024)

// The GPU's core clock is 500 MHz (the value is expressed in Hz):

#define GPU_CLOCK_SPEED                             (500*1000*1000)

// AND with this value as part of the calculation to convert from a
// virtual memory pointer to a physical address that can be used by the GPU.
// GPU_MEMORY_SIZE is a power of two, so this evaluates to 0x1FFFFFFF and
// keeps the low 29 bits of an address:

#define GPU_ADDRESS_MASK                            (GPU_MEMORY_SIZE - 1)

// The CPU's virtual addresses for 'physical memory' allocations are divided
// into three sections:
//
// 0xA0000000 - 0xBFFFFFFF 64 KB Pages
// 0xC0000000 - 0xDFFFFFFF 16 MB Pages (Address range also used for cached-read-only pages, see below)
// 0xE0000000 - 0xFFFFFFFF  4 KB Pages
//
// NOTE(review): GPU_CPU_VIRTUAL_ADDRESS_4KB_START is 0xDFFFF000, which is
// 0x1000 below the 0xE0000000 section start listed above.
// GPU_CONVERT_CPU_TO_GPU_ADDRESS adds 0x1000 to addresses in the
// 0xE0000000 - 0xFFFFFFFF section, so this value appears to be biased so
// that GPU_CONVERT_GPU_TO_CPU_ADDRESS_4KB undoes that offset.  Confirm
// before using it as the lower bound of a range check.

#define GPU_CPU_VIRTUAL_ADDRESS_64KB_START          ((DWORD) 0xA0000000)
#define GPU_CPU_VIRTUAL_ADDRESS_64KB_END            ((DWORD) 0xBFFFFFFF)
#define GPU_CPU_VIRTUAL_ADDRESS_16MB_START          ((DWORD) 0xC0000000)
#define GPU_CPU_VIRTUAL_ADDRESS_16MB_END            ((DWORD) 0xDFFFFFFF)
#define GPU_CPU_VIRTUAL_ADDRESS_4KB_START           ((DWORD) 0xDFFFF000)
#define GPU_CPU_VIRTUAL_ADDRESS_4KB_END             ((DWORD) 0xFFFFFFFF)

// Same value as GPU_CPU_VIRTUAL_ADDRESS_64KB_START, i.e. the lowest of the
// three sections above:

#define GPU_CPU_VIRTUAL_ADDRESS                     ((DWORD) 0xA0000000)

// The 64 KB and 4 KB physical memory pages can be accessed as
// cached, read-only memory by offsetting their addresses from their
// native range to the corresponding 0xC0000000 to 0xDFFFFFFF range.
// 16 MB pages are always write-combined. They don't have a corresponding
// cached, read-only representation.
// The base below equals GPU_CPU_VIRTUAL_ADDRESS_16MB_START.

#define GPU_CPU_CACHED_READONLY_VIRTUAL_ADDRESS     ((DWORD) 0xC0000000)

// There are sixteen 32-bit 'tail write-back' registers on the CPU.  The
// GPU's command processor can write directly to these registers.  They're
// beneficial over GPU write-backs to normal cacheable memory because they
// reduce the latency, which is particularly important for XPS: the CPU
// doesn't have to go all the way to memory to read a new write-back value.
// There is one register per 128 bytes at the following physical address
// range.  The remainder of the range is unused.
//
// With 16 registers spaced GPU_TAIL_WRITEBACK_DELTA (128) bytes apart, the
// registers span the first 16 * 128 = 2048 bytes of the 0x00100000-byte
// range.

#define GPU_TAIL_WRITEBACKS                         16
#define GPU_TAIL_WRITEBACK_DELTA                    128

#define GPU_TAIL_WRITEBACK_RANGE_VIRTUAL_ADDRESS    ((DWORD) 0x7F000000)
#define GPU_TAIL_WRITEBACK_RANGE_PHYSICAL_ADDRESS   ((DWORD) 0xC0000000)
#define GPU_TAIL_WRITEBACK_RANGE_SIZE               0x00100000

// There is a particular physical memory range where the GPU knows to request
// memory from the CPU's L2 instead of from physical memory.  This is used
// solely by XPS.  The L2 set has to be 'locked' and each cache-line in the
// set has to be pre-initialized using 'dcbz128' to a virtual address
// which corresponds to a consecutive subset of this physical memory range.
// (In other words, unless the cache is pre-populated, GPU requests from this
// memory range will be lost.)
//
// Both the virtual and the physical base below start exactly
// GPU_TAIL_WRITEBACK_RANGE_SIZE (0x00100000) bytes after the corresponding
// tail write-back range base, i.e. the two ranges are adjacent.

#define GPU_XPS_LOCKED_L2_RANGE_VIRTUAL_ADDRESS     ((DWORD) 0x7F100000)
#define GPU_XPS_LOCKED_L2_RANGE_PHYSICAL_ADDRESS    ((DWORD) 0xC0100000)
#define GPU_XPS_LOCKED_L2_RANGE_SIZE                0x07F00000

// Translate a CPU virtual address into the address the GPU uses for the
// same memory.  Only 'physical memory' (as allocated by XPhysicalAlloc or
// XMemAlloc(XALLOC_MEMTYPE_PHYSICAL)) can be translated this way.
//
// The result is the address masked with GPU_ADDRESS_MASK, plus a bias of
// 0x1000 that is applied only when the upper 12 bits of a 32-bit address
// are 0xE00 or higher (the 0xE0000000 - 0xFFFFFFFF section).

static __forceinline DWORD GPU_CONVERT_CPU_TO_GPU_ADDRESS(CONST void* CpuAddress)
{
    const UINT_PTR Address = (UINT_PTR) CpuAddress;

    // Offset of the address inside the GPU-addressable window.
    const DWORD WindowOffset = ((DWORD) Address) & GPU_ADDRESS_MASK;

    // Adding 0x200 to the megabyte index carries into bit 12 exactly when
    // the index is 0xE00 or above, yielding either 0 or 0x1000.
    const DWORD SectionBias = (DWORD) (((Address >> 20) + 0x200) & 0x1000);

    return WindowOffset + SectionBias;
}

// Translate a GPU address back into a CPU usable address.  The result can
// only be used if the memory was originally allocated as 'physical memory'
// by XPhysicalAlloc or XMemAlloc(XALLOC_MEMTYPE_PHYSICAL).  CPU accesses
// through the returned pointer use the memory-protection type (e.g., cached
// or write-combining) of that original allocation.
//
// This variant rebases the address onto GPU_CPU_VIRTUAL_ADDRESS_64KB_START.

static __forceinline void* GPU_CONVERT_GPU_TO_CPU_ADDRESS_64KB(DWORD GpuAddress)
{
    const DWORD CpuAddress = GPU_CPU_VIRTUAL_ADDRESS_64KB_START + GpuAddress;

    return (void*) (UINT_PTR) CpuAddress;
}

// Rebase a GPU address onto GPU_CPU_VIRTUAL_ADDRESS_16MB_START and return
// it as a CPU pointer.

static __forceinline void* GPU_CONVERT_GPU_TO_CPU_ADDRESS_16MB(DWORD GpuAddress)
{
    const DWORD CpuAddress = GPU_CPU_VIRTUAL_ADDRESS_16MB_START + GpuAddress;

    return (void*) (UINT_PTR) CpuAddress;
}

// Rebase a GPU address onto GPU_CPU_VIRTUAL_ADDRESS_4KB_START and return
// it as a CPU pointer.

static __forceinline void* GPU_CONVERT_GPU_TO_CPU_ADDRESS_4KB(DWORD GpuAddress)
{
    const DWORD CpuAddress = GPU_CPU_VIRTUAL_ADDRESS_4KB_START + GpuAddress;

    return (void*) (UINT_PTR) CpuAddress;
}

// Translate a GPU address into a CPU usable address in the cached,
// read-only view.  All of memory can be reached this way regardless of how
// it was originally allocated.
//
// The hardware has no automatic cache coherency between write-combining and
// cached views, so care must be taken to avoid stale data: if the CPU does
// a cached read of this memory, then modifies it using write-combining, and
// then does a cached read again, the second read may return stale cached
// data unless CPU cache control instructions are used.

static __forceinline void* GPU_CONVERT_GPU_TO_CPU_CACHED_READONLY_ADDRESS(DWORD GpuAddress)
{
    const DWORD CachedAddress = GPU_CPU_CACHED_READONLY_VIRTUAL_ADDRESS + GpuAddress;

    return (void*) (UINT_PTR) CachedAddress;
}

// Translate a CPU address into the matching CPU cached, read-only address.
// Only 'physical memory' (as allocated by XPhysicalAlloc or
// XMemAlloc(XALLOC_MEMTYPE_PHYSICAL)) can be translated this way.
//
// Implemented as two steps: CPU address -> GPU address -> cached view.

static __forceinline void* GPU_CONVERT_CPU_TO_CPU_CACHED_READONLY_ADDRESS(CONST void* CpuAddress)
{
    const DWORD GpuAddress = GPU_CONVERT_CPU_TO_GPU_ADDRESS(CpuAddress);

    return GPU_CONVERT_GPU_TO_CPU_CACHED_READONLY_ADDRESS(GpuAddress);
}

// Translate a CPU address that points into the XPS locked L2 cache range
// into a GPU physical address, by swapping the range's virtual base for its
// physical base.

static __forceinline DWORD GPU_CONVERT_XPS_CPU_TO_GPU_ADDRESS(CONST void* CpuAddress)
{
    const DWORD RangeOffset = ((DWORD) (UINT_PTR) CpuAddress)
                            - GPU_XPS_LOCKED_L2_RANGE_VIRTUAL_ADDRESS;

    return RangeOffset + GPU_XPS_LOCKED_L2_RANGE_PHYSICAL_ADDRESS;
}

// Translate a GPU address that points into the XPS locked L2 cache range
// into a CPU virtual address, by swapping the range's physical base for its
// virtual base.

static __forceinline void* GPU_CONVERT_XPS_GPU_TO_CPU_ADDRESS(DWORD GpuAddress)
{
    const DWORD RangeOffset = GpuAddress - GPU_XPS_LOCKED_L2_RANGE_PHYSICAL_ADDRESS;
    const DWORD CpuAddress = RangeOffset + GPU_XPS_LOCKED_L2_RANGE_VIRTUAL_ADDRESS;

    return (void*) (UINT_PTR) CpuAddress;
}

// Total size of the shader program store, in instructions:

#define GPU_INSTRUCTIONS                            4096

// Total number of constants supported by a single context.
// GPU_FLOW_CONSTANTS evaluates to 256/32 + 32 = 40 (boolean constants are
// counted 32 per register), and GPU_VERTEX_FETCH_CONSTANTS evaluates to
// 3 * 32 = 96 (three vertex fetch constants per fetch constant):

#define GPU_ALU_CONSTANTS                           512
#define GPU_BOOLEAN_CONSTANTS                       256
#define GPU_INTEGER_CONSTANTS                       32
#define GPU_FLOW_CONSTANTS                          (GPU_BOOLEAN_CONSTANTS/32 + GPU_INTEGER_CONSTANTS) // Actual register count
#define GPU_FETCH_CONSTANTS                         32
#define GPU_VERTEX_FETCH_CONSTANTS                  (3*GPU_FETCH_CONSTANTS)

// How D3D divides up the constants:

#define GPU_D3D_VERTEX_CONSTANTF_BASE               0
#define GPU_D3D_VERTEX_CONSTANTF_COUNT              256
#define GPU_D3D_VERTEX_CONSTANTI_BASE               0
#define GPU_D3D_VERTEX_CONSTANTI_COUNT              16
#define GPU_D3D_VERTEX_CONSTANTB_BASE               0
#define GPU_D3D_VERTEX_CONSTANTB_COUNT              128
#define GPU_D3D_VERTEX_FETCH_CONSTANT_BASE          26
#define GPU_D3D_VERTEX_FETCH_CONSTANT_COUNT         6 // Good for 18 streams
#define GPU_D3D_VERTEX_TEXTURE_FETCH_CONSTANT_BASE  16
#define GPU_D3D_VERTEX_TEXTURE_FETCH_CONSTANT_COUNT 10

#define GPU_D3D_PIXEL_CONSTANTF_BASE                GPU_D3D_VERTEX_CONSTANTF_COUNT
#define GPU_D3D_PIXEL_CONSTANTF_COUNT               256
#define GPU_D3D_PIXEL_CONSTANTI_BASE                GPU_D3D_VERTEX_CONSTANTI_COUNT
#define GPU_D3D_PIXEL_CONSTANTI_COUNT               16
#define GPU_D3D_PIXEL_CONSTANTB_BASE                GPU_D3D_VERTEX_CONSTANTB_COUNT
#define GPU_D3D_PIXEL_CONSTANTB_COUNT               128
#define GPU_D3D_PIXEL_TEXTURE_FETCH_CONSTANT_BASE   0
#define GPU_D3D_PIXEL_TEXTURE_FETCH_CONSTANT_COUNT  26

// Total texture fetch constants tracked by D3D runtime
// for both vertex and pixel shaders:

#define GPU_D3D_TEXTURE_FETCH_CONSTANT_BASE         0
#define GPU_D3D_TEXTURE_FETCH_CONSTANT_COUNT        26

// Fetch constant allocation strategy:
// pixel shader samplers start at texture constant 0, and proceed upwards.
// vertex shader samplers start at texture constant 16 and proceed upwards.
// The constant D3DDMAPSAMPLER is set to 16. Typically vertex shaders will use
// textures D3DDMAPSAMPLER..D3DDMAPSAMPLER+3, while pixel shaders will use
// textures 0..15. But it's up to the shader authors to sort things out.

//  0..15 : samplers 0..15 for pixel shaders
// 16..19 : samplers 0..3 for vertex shaders
//        : also samplers 16..19 for pixel shaders
// 20..25 : samplers 20..25 for pixel shaders
//          also samplers 4..9 for vertex shaders
//          also vertex streams 18..35
// 26..31 : Vertex stream fetch constants 0..17(31.High is stream 0, 31.Middle is stream 1, and so on.)

// GPU_CONVERT_D3D_TO_HARDWARE_TEXTUREFETCHCONSTANT adds the pixel texture
// fetch base to X; that base is 0 in this header, so the mapping is
// currently the identity.
//
// GPU_CONVERT_D3D_TO_HARDWARE_VERTEXFETCHCONSTANT counts downwards from the
// last vertex fetch slot (GPU_VERTEX_FETCH_CONSTANTS - 1 = 95): stream 0
// maps to 95 and stream 17 maps to 78.

#define GPU_CONVERT_D3D_TO_HARDWARE_TEXTUREFETCHCONSTANT(X) ((X) + GPU_D3D_PIXEL_TEXTURE_FETCH_CONSTANT_BASE)
#define GPU_CONVERT_D3D_TO_HARDWARE_VERTEXFETCHCONSTANT(X) ((GPU_VERTEX_FETCH_CONSTANTS - 1) - (X))

// GPU Temporary register count (that can be used by our runtime)

#define GPU_SHADER_TEMPORARY_REGISTER_COUNT         32

// GPU Temporary register count (that the hardware supports).  This is twice
// the count that GPU_SHADER_TEMPORARY_REGISTER_COUNT exposes to the runtime.

#define GPU_SHADER_TEMPORARY_REGISTER_COUNT_PHYSICAL 64

// Vertex buffer maximum number of verts (16777216 = 2^24):

#define GPU_MAX_VERTEX_BUFFER_DIMENSION             16777216

// 2D textures have a maximum dimension of 8K by 8K:

#define GPU_MAX_TEXTURE_DIMENSION                   8192

// 1D textures have a maximum dimension of 16M (16777216 = 2^24):

#define GPU_MAX_1D_TEXTURE_DIMENSION                16777216

// Volume textures can have up to 1024 layers in the Z direction:

#define GPU_MAX_TEXTURE_DEPTH                       1024

// Textures have to have 4K alignment both for the base and mip levels
// (value is in bytes):

#define GPU_TEXTURE_ALIGNMENT                       4096

// Textures have to have a pitch that is a multiple of 32 texels:

#define GPU_TEXTURE_TEXEL_PITCH_ALIGNMENT           32

// Linear textures have to have a pitch that is a multiple of 256 bytes:

#define GPU_LINEAR_TEXTURE_PITCH_BYTE_ALIGNMENT     256

// Texture tiles are 32x32x4 texels (32 in width and height, 4 in depth):

#define GPU_TEXTURE_TILE_DIMENSION                  32

#define GPU_TEXTURE_TILE_DEPTH_DIMENSION            4

// Resolve rectangles must always be 8x8 pixel aligned:

#define GPU_RESOLVE_ALIGNMENT                       8

// Maximum vertex fetch stride in dwords:

#define GPU_MAX_VERTEX_STRIDE                       255

// There are 16 interpolators for the pixel shader:

#define GPU_INTERPOLATORS                           16

// There is a pool of 128 sets of GPRs:

#define GPU_GPRS                                    128

// There are 4 channels per interpolator:

#define GPU_INTERPOLATOR_CHANNELS                   4

// Size of the post-transform cache.  VtxReuseDepth should always be
// set to this value:

#define GPU_VERTEX_REUSE_DEPTH                      14

// Distance from pixel center to outermost sample for multisampling.
// MaxSampleDist should always be set to the appropriate value for the
// multisample mode in use (1X, 2X or 4X):

#define GPU_MAX_SAMPLE_DIST_1X                      0
#define GPU_MAX_SAMPLE_DIST_2X                      4
#define GPU_MAX_SAMPLE_DIST_4X                      6

// Distance (in indices) which the vertex vector slot assignment leads the
// deallocation. DeallocDist should always be set to this value:

#define GPU_DEALLOC_DIST                            16

// EDRAM tiles are allocated in units of 80x16 pixels at 1X multisampling,
// 80x8 at 2X and 40x8 at 4X.  Width * height * sample count is 1280 in
// all three modes:

#define GPU_EDRAM_TILE_WIDTH_1X                     80
#define GPU_EDRAM_TILE_HEIGHT_1X                    16

#define GPU_EDRAM_TILE_WIDTH_2X                     80
#define GPU_EDRAM_TILE_HEIGHT_2X                    8

#define GPU_EDRAM_TILE_WIDTH_4X                     40
#define GPU_EDRAM_TILE_HEIGHT_4X                    8

// EDRAM tile size in fragments instead of pixels (same as the 1X pixel
// dimensions):

#define GPU_EDRAM_TILE_WIDTH_IN_FRAGMENTS           GPU_EDRAM_TILE_WIDTH_1X
#define GPU_EDRAM_TILE_HEIGHT_IN_FRAGMENTS          GPU_EDRAM_TILE_HEIGHT_1X

// An EDRAM tile size, in bytes (5120 = 80 * 16 fragments * 4 bytes).  Note
// that 64-bit surfaces have an allocation granularity of twice this
// (10240 bytes) but can have a 5120 byte start alignment:

#define GPU_EDRAM_TILE_SIZE                         5120

// Total number of usable EDRAM tiles (10 MB / 5120 bytes = 2048):

#define GPU_EDRAM_TILES                             (GPU_EDRAM_SIZE / GPU_EDRAM_TILE_SIZE)

// Hierarchical Z tiles are allocated in units of 32x16 at 1X multisampling,
// 32x8 at 2X and 16x8 at 4X:

#define GPU_HIERARCHICAL_Z_TILE_WIDTH_1X            32
#define GPU_HIERARCHICAL_Z_TILE_HEIGHT_1X           16

#define GPU_HIERARCHICAL_Z_TILE_WIDTH_2X            32
#define GPU_HIERARCHICAL_Z_TILE_HEIGHT_2X           8

#define GPU_HIERARCHICAL_Z_TILE_WIDTH_4X            16
#define GPU_HIERARCHICAL_Z_TILE_HEIGHT_4X           8

// Hierarchical Z tile size in fragments instead of pixels (same as the 1X
// pixel dimensions):

#define GPU_HIERARCHICAL_Z_TILE_WIDTH_IN_FRAGMENTS  GPU_HIERARCHICAL_Z_TILE_WIDTH_1X
#define GPU_HIERARCHICAL_Z_TILE_HEIGHT_IN_FRAGMENTS GPU_HIERARCHICAL_Z_TILE_HEIGHT_1X

// A hierarchical Z tile size, in fragments (512 = 32 * 16):

#define GPU_HIERARCHICAL_Z_TILE_SIZE                512

// Total number of usable hierarchical Z tiles.  This is enough for
// 1280x720x2X: with 32x8 tiles that is (1280 / 32) * (720 / 8) = 3600:

#define GPU_HIERARCHICAL_Z_TILES                    3600

// Resource address shift amounts for storage in header fields.  A shift of
// 2 corresponds to 4-byte units and a shift of 12 to 4096-byte units; the
// latter matches GPU_TEXTURE_ALIGNMENT:

#define GPU_VERTEXBUFFER_ADDRESS_SHIFT              2
#define GPU_VERTEXBUFFER_SIZE_SHIFT                 2
#define GPU_TEXTURE_ADDRESS_SHIFT                   12

// GPU callable command buffers require a 32 byte alignment:

#define GPU_COMMAND_BUFFER_ALIGNMENT                32

// GPU callable indirect command buffers have a 1 MB DWORD maximum size.
// NOTE(review): the value is 1 << 20 = 1048576; the wording suggests the
// unit is DWORDs rather than bytes -- confirm against the callers.

#define GPU_COMMAND_BUFFER_INDIRECT_MAX_SIZE        (1 << 20)

// Masks and shifts for GPU control flow exec instruction
// TypeAndSerialize field.  Each mask is a single bit equal to
// 1 << the corresponding shift (bit 0 = type, bit 1 = serialize):

#define GPUEXEC_TYPE_SHIFT                          0
#define GPUEXEC_TYPE_MASK                           1 // Set if Fetch instruction
#define GPUEXEC_SERIALIZE_SHIFT                     1
#define GPUEXEC_SERIALIZE_MASK                      2 // Set if serialized

// Maximum number of ALU/Fetch instructions in an Exec:

#define GPUFLOW_MAX_EXEC_COUNT                      6

// Guard band size:

#define GPU_GUARDBAND                               8192

// Computes the guard band factor for a guard band of GUARDBAND_PIXELS
// pixels at the given SCALE, where SCALE is half the height or half the
// width of the screen.  The result is 1 + GUARDBAND_PIXELS / SCALE,
// evaluated in single-precision floating point.  Both arguments are
// evaluated exactly once:

#define GPU_GUARDBANDFACTOR(GUARDBAND_PIXELS, SCALE) \
    ((((float) (GUARDBAND_PIXELS)) / ((float) (SCALE))) + 1.0f)

// Largest sprite (point) size:

#define GPU_MAX_POINT_SIZE                          256.0f

//------------------------------------------------------------------------------

// Endian mode encodings (values 0..3).  GPUENDIAN128 repeats these four
// values and adds two more.  The 8INxx/16INxx names presumably describe the
// swap unit and the span it is swapped within -- confirm against the
// hardware documentation.

typedef enum
{
    GPUENDIAN_NONE                                  = 0,
    GPUENDIAN_8IN16                                 = 1,
    GPUENDIAN_8IN32                                 = 2,
    GPUENDIAN_16IN32                                = 3,
} GPUENDIAN;

// Endian mode encodings (values 0..5).  Values 0..3 are numerically
// identical to the same-named GPUENDIAN entries; 8IN64 and 8IN128 exist
// only in this enum.

typedef enum
{
    GPUENDIAN128_NONE                               = 0,
    GPUENDIAN128_8IN16                              = 1,
    GPUENDIAN128_8IN32                              = 2,
    GPUENDIAN128_16IN32                             = 3,
    GPUENDIAN128_8IN64                              = 4,
    GPUENDIAN128_8IN128                             = 5,
} GPUENDIAN128;

// Color array selector: 2D color (0) or 3D slice color (1).

typedef enum
{
    GPUCOLORARRAY_2D_COLOR                          = 0,
    GPUCOLORARRAY_3D_SLICE_COLOR                    = 1,
} GPUCOLORARRAY;

// Depth array selector.  Note that the ALT_DEPTH variant is value 0 and the
// plain 2D_DEPTH variant is value 1.

typedef enum
{
    GPUDEPTHARRAY_2D_ALT_DEPTH                      = 0,
    GPUDEPTHARRAY_2D_DEPTH                          = 1,
} GPUDEPTHARRAY;

// Color format encodings.  The numbering is sparse (it starts at 2 and has
// gaps).  Every entry up to 38 has the same numeric value as the same-named
// GPUTEXTUREFORMAT entry.
//
// NOTE(review): GPUCOLORFORMAT_2_10_10_10_FLOAT is 62, whereas
// GPUTEXTUREFORMAT_2_10_10_10_FLOAT_EDRAM is 63 and GPUTEXTUREFORMAT uses
// 62 for 8_8_8_8_GAMMA_EDRAM.  Confirm the difference is intended before
// converting between the two enums by value.

typedef enum
{
    GPUCOLORFORMAT_8                                = 2,
    GPUCOLORFORMAT_1_5_5_5                          = 3,
    GPUCOLORFORMAT_5_6_5                            = 4,
    GPUCOLORFORMAT_6_5_5                            = 5,
    GPUCOLORFORMAT_8_8_8_8                          = 6,
    GPUCOLORFORMAT_2_10_10_10                       = 7,
    GPUCOLORFORMAT_8_A                              = 8,
    GPUCOLORFORMAT_8_B                              = 9,
    GPUCOLORFORMAT_8_8                              = 10,
    GPUCOLORFORMAT_8_8_8_8_A                        = 14,
    GPUCOLORFORMAT_4_4_4_4                          = 15,
    GPUCOLORFORMAT_10_11_11                         = 16,
    GPUCOLORFORMAT_11_11_10                         = 17,
    GPUCOLORFORMAT_16                               = 24,
    GPUCOLORFORMAT_16_16                            = 25,
    GPUCOLORFORMAT_16_16_16_16                      = 26,
    GPUCOLORFORMAT_16_FLOAT                         = 30,
    GPUCOLORFORMAT_16_16_FLOAT                      = 31,
    GPUCOLORFORMAT_16_16_16_16_FLOAT                = 32,
    GPUCOLORFORMAT_32_FLOAT                         = 36,
    GPUCOLORFORMAT_32_32_FLOAT                      = 37,
    GPUCOLORFORMAT_32_32_32_32_FLOAT                = 38,
    GPUCOLORFORMAT_2_10_10_10_FLOAT                 = 62, // EDRAM render target only
} GPUCOLORFORMAT;

// Texture format encodings.  The values are contiguous from 0 to 63, so
// every value fits in 6 bits.  Entries marked "EDRAM render target only"
// (13, 21, 62 and 63) are flagged as such by the original author.
// GPUCOLORFORMAT and GPUVERTEXFORMAT reuse a subset of these numeric values
// for their same-named entries.

typedef enum
{
    GPUTEXTUREFORMAT_1_REVERSE                      = 0,
    GPUTEXTUREFORMAT_1                              = 1,
    GPUTEXTUREFORMAT_8                              = 2,
    GPUTEXTUREFORMAT_1_5_5_5                        = 3,
    GPUTEXTUREFORMAT_5_6_5                          = 4,
    GPUTEXTUREFORMAT_6_5_5                          = 5,
    GPUTEXTUREFORMAT_8_8_8_8                        = 6,
    GPUTEXTUREFORMAT_2_10_10_10                     = 7,
    GPUTEXTUREFORMAT_8_A                            = 8,
    GPUTEXTUREFORMAT_8_B                            = 9,
    GPUTEXTUREFORMAT_8_8                            = 10,
    GPUTEXTUREFORMAT_Cr_Y1_Cb_Y0_REP                = 11,
    GPUTEXTUREFORMAT_Y1_Cr_Y0_Cb_REP                = 12,
    GPUTEXTUREFORMAT_16_16_EDRAM                    = 13, // EDRAM render target only
    GPUTEXTUREFORMAT_8_8_8_8_A                      = 14,
    GPUTEXTUREFORMAT_4_4_4_4                        = 15,
    GPUTEXTUREFORMAT_10_11_11                       = 16,
    GPUTEXTUREFORMAT_11_11_10                       = 17,
    GPUTEXTUREFORMAT_DXT1                           = 18,
    GPUTEXTUREFORMAT_DXT2_3                         = 19,
    GPUTEXTUREFORMAT_DXT4_5                         = 20,
    GPUTEXTUREFORMAT_16_16_16_16_EDRAM              = 21, // EDRAM render target only
    GPUTEXTUREFORMAT_24_8                           = 22,
    GPUTEXTUREFORMAT_24_8_FLOAT                     = 23,
    GPUTEXTUREFORMAT_16                             = 24,
    GPUTEXTUREFORMAT_16_16                          = 25,
    GPUTEXTUREFORMAT_16_16_16_16                    = 26,
    GPUTEXTUREFORMAT_16_EXPAND                      = 27,
    GPUTEXTUREFORMAT_16_16_EXPAND                   = 28,
    GPUTEXTUREFORMAT_16_16_16_16_EXPAND             = 29,
    GPUTEXTUREFORMAT_16_FLOAT                       = 30,
    GPUTEXTUREFORMAT_16_16_FLOAT                    = 31,
    GPUTEXTUREFORMAT_16_16_16_16_FLOAT              = 32,
    GPUTEXTUREFORMAT_32                             = 33,
    GPUTEXTUREFORMAT_32_32                          = 34,
    GPUTEXTUREFORMAT_32_32_32_32                    = 35,
    GPUTEXTUREFORMAT_32_FLOAT                       = 36,
    GPUTEXTUREFORMAT_32_32_FLOAT                    = 37,
    GPUTEXTUREFORMAT_32_32_32_32_FLOAT              = 38,
    GPUTEXTUREFORMAT_32_AS_8                        = 39,
    GPUTEXTUREFORMAT_32_AS_8_8                      = 40,
    GPUTEXTUREFORMAT_16_MPEG                        = 41,
    GPUTEXTUREFORMAT_16_16_MPEG                     = 42,
    GPUTEXTUREFORMAT_8_INTERLACED                   = 43,
    GPUTEXTUREFORMAT_32_AS_8_INTERLACED             = 44,
    GPUTEXTUREFORMAT_32_AS_8_8_INTERLACED           = 45,
    GPUTEXTUREFORMAT_16_INTERLACED                  = 46,
    GPUTEXTUREFORMAT_16_MPEG_INTERLACED             = 47,
    GPUTEXTUREFORMAT_16_16_MPEG_INTERLACED          = 48,
    GPUTEXTUREFORMAT_DXN                            = 49,
    GPUTEXTUREFORMAT_8_8_8_8_AS_16_16_16_16         = 50,
    GPUTEXTUREFORMAT_DXT1_AS_16_16_16_16            = 51,
    GPUTEXTUREFORMAT_DXT2_3_AS_16_16_16_16          = 52,
    GPUTEXTUREFORMAT_DXT4_5_AS_16_16_16_16          = 53,
    GPUTEXTUREFORMAT_2_10_10_10_AS_16_16_16_16      = 54,
    GPUTEXTUREFORMAT_10_11_11_AS_16_16_16_16        = 55,
    GPUTEXTUREFORMAT_11_11_10_AS_16_16_16_16        = 56,
    GPUTEXTUREFORMAT_32_32_32_FLOAT                 = 57,
    GPUTEXTUREFORMAT_DXT3A                          = 58,
    GPUTEXTUREFORMAT_DXT5A                          = 59,
    GPUTEXTUREFORMAT_CTX1                           = 60,
    GPUTEXTUREFORMAT_DXT3A_AS_1_1_1_1               = 61,
    GPUTEXTUREFORMAT_8_8_8_8_GAMMA_EDRAM            = 62, // EDRAM render target only
    GPUTEXTUREFORMAT_2_10_10_10_FLOAT_EDRAM         = 63, // EDRAM render target only
} GPUTEXTUREFORMAT;

// Vertex format encodings.  The numbering is sparse; every entry has the
// same numeric value as the same-named GPUTEXTUREFORMAT entry.

typedef enum
{
    GPUVERTEXFORMAT_8_8_8_8                         = 6,
    GPUVERTEXFORMAT_2_10_10_10                      = 7,
    GPUVERTEXFORMAT_10_11_11                        = 16,
    GPUVERTEXFORMAT_11_11_10                        = 17,
    GPUVERTEXFORMAT_16_16                           = 25,
    GPUVERTEXFORMAT_16_16_16_16                     = 26,
    GPUVERTEXFORMAT_16_16_FLOAT                     = 31,
    GPUVERTEXFORMAT_16_16_16_16_FLOAT               = 32,
    GPUVERTEXFORMAT_32                              = 33,
    GPUVERTEXFORMAT_32_32                           = 34,
    GPUVERTEXFORMAT_32_32_32_32                     = 35,
    GPUVERTEXFORMAT_32_FLOAT                        = 36,
    GPUVERTEXFORMAT_32_32_FLOAT                     = 37,
    GPUVERTEXFORMAT_32_32_32_32_FLOAT               = 38,
    GPUVERTEXFORMAT_32_32_32_FLOAT                  = 57,
} GPUVERTEXFORMAT;

// EDRAM color format encodings.  This is a separate numbering from
// GPUCOLORFORMAT/GPUTEXTUREFORMAT.  Values 0..7, 10, 12, 14 and 15 are
// defined; 8, 9, 11 and 13 have no name in this header.

typedef enum
{
    GPUEDRAMCOLORFORMAT_8_8_8_8                     = 0,
    GPUEDRAMCOLORFORMAT_8_8_8_8_GAMMA               = 1,
    GPUEDRAMCOLORFORMAT_2_10_10_10                  = 2,
    GPUEDRAMCOLORFORMAT_2_10_10_10_FLOAT            = 3,
    GPUEDRAMCOLORFORMAT_16_16                       = 4,
    GPUEDRAMCOLORFORMAT_16_16_16_16                 = 5,
    GPUEDRAMCOLORFORMAT_16_16_FLOAT                 = 6,
    GPUEDRAMCOLORFORMAT_16_16_16_16_FLOAT           = 7,
    GPUEDRAMCOLORFORMAT_2_10_10_10_AS_10_10_10_10   = 10,
    GPUEDRAMCOLORFORMAT_2_10_10_10_FLOAT_AS_16_16_16_16 = 12,
    GPUEDRAMCOLORFORMAT_32_FLOAT                    = 14,
    GPUEDRAMCOLORFORMAT_32_32_FLOAT                 = 15,
} GPUEDRAMCOLORFORMAT;

// EDRAM depth format encodings: 24_8 (0) or 24_8_FLOAT (1).

typedef enum
{
    GPUEDRAMDEPTHFORMAT_24_8                        = 0,
    GPUEDRAMDEPTHFORMAT_24_8_FLOAT                  = 1,
} GPUEDRAMDEPTHFORMAT;

// Sign mode encodings.  Each value fits in two bits.

typedef enum
{
    GPUSIGN_UNSIGNED                                = 0,
    GPUSIGN_SIGNED                                  = 1,
    GPUSIGN_BIAS                                    = 2,
    GPUSIGN_GAMMA                                   = 3,
} GPUSIGN;

// Four sign modes packed into one value, two bits per field at bit offsets
// 0, 2, 4 and 6.  GPUSIGN_ALL_UNSIGNED evaluates to 0x00 and
// GPUSIGN_ALL_SIGNED to 0x55.

#define GPUSIGN_ALL_UNSIGNED \
    ((GPUSIGN_UNSIGNED << 0) | (GPUSIGN_UNSIGNED << 2) | (GPUSIGN_UNSIGNED << 4) | (GPUSIGN_UNSIGNED << 6))
#define GPUSIGN_ALL_SIGNED \
    ((GPUSIGN_SIGNED << 0) | (GPUSIGN_SIGNED << 2) | (GPUSIGN_SIGNED << 4) | (GPUSIGN_SIGNED << 6))

// Swizzle selector encodings.  X/Y/Z/W select a source channel, while _0
// and _1 are distinct selectors named after the constants 0 and 1.  Each
// value fits in three bits; value 6 has no name in this header.

typedef enum
{
    GPUSWIZZLE_X                                    = 0,
    GPUSWIZZLE_Y                                    = 1,
    GPUSWIZZLE_Z                                    = 2,
    GPUSWIZZLE_W                                    = 3,
    GPUSWIZZLE_0                                    = 4,
    GPUSWIZZLE_1                                    = 5,
    GPUSWIZZLE_KEEP                                 = 7, // Fetch instructions only
} GPUSWIZZLE;

// Four selectors packed into one value, three bits per field at bit offsets
// 0, 3, 6 and 9.  The letters of each macro name list the fields from the
// highest (offset 9) down to the lowest (offset 0), using:
//   R = GPUSWIZZLE_X, G = GPUSWIZZLE_Y, B = GPUSWIZZLE_Z, A = GPUSWIZZLE_W,
//   O = GPUSWIZZLE_1 (one), Z = GPUSWIZZLE_0 (zero).

#define GPUSWIZZLE_ARGB ((GPUSWIZZLE_Z << 0) | (GPUSWIZZLE_Y << 3) | (GPUSWIZZLE_X << 6) | (GPUSWIZZLE_W << 9))
#define GPUSWIZZLE_ORGB ((GPUSWIZZLE_Z << 0) | (GPUSWIZZLE_Y << 3) | (GPUSWIZZLE_X << 6) | (GPUSWIZZLE_1 << 9))
#define GPUSWIZZLE_ABGR ((GPUSWIZZLE_X << 0) | (GPUSWIZZLE_Y << 3) | (GPUSWIZZLE_Z << 6) | (GPUSWIZZLE_W << 9))
#define GPUSWIZZLE_OBGR ((GPUSWIZZLE_X << 0) | (GPUSWIZZLE_Y << 3) | (GPUSWIZZLE_Z << 6) | (GPUSWIZZLE_1 << 9))
#define GPUSWIZZLE_OOGR ((GPUSWIZZLE_X << 0) | (GPUSWIZZLE_Y << 3) | (GPUSWIZZLE_1 << 6) | (GPUSWIZZLE_1 << 9))
#define GPUSWIZZLE_OZGR ((GPUSWIZZLE_X << 0) | (GPUSWIZZLE_Y << 3) | (GPUSWIZZLE_0 << 6) | (GPUSWIZZLE_1 << 9))
#define GPUSWIZZLE_RZZZ ((GPUSWIZZLE_0 << 0) | (GPUSWIZZLE_0 << 3) | (GPUSWIZZLE_0 << 6) | (GPUSWIZZLE_X << 9))
#define GPUSWIZZLE_OOOR ((GPUSWIZZLE_X << 0) | (GPUSWIZZLE_1 << 3) | (GPUSWIZZLE_1 << 6) | (GPUSWIZZLE_1 << 9))
#define GPUSWIZZLE_ORRR ((GPUSWIZZLE_X << 0) | (GPUSWIZZLE_X << 3) | (GPUSWIZZLE_X << 6) | (GPUSWIZZLE_1 << 9))
#define GPUSWIZZLE_GRRR ((GPUSWIZZLE_X << 0) | (GPUSWIZZLE_X << 3) | (GPUSWIZZLE_X << 6) | (GPUSWIZZLE_Y << 9))
#define GPUSWIZZLE_RGBA ((GPUSWIZZLE_W << 0) | (GPUSWIZZLE_Z << 3) | (GPUSWIZZLE_Y << 6) | (GPUSWIZZLE_X << 9))

// Number format selector: fraction (0) or integer (1).

typedef enum
{
    GPUNUMFORMAT_FRACTION                           = 0,
    GPUNUMFORMAT_INTEGER                            = 1,
} GPUNUMFORMAT;

// Constant type encodings (values 0..3).  In this numbering bit 0
// distinguishes texture (0) from vertex (1), and bit 1 is clear for the
// INVALID_* entries and set for the others.

typedef enum
{
    GPUCONSTANTTYPE_INVALID_TEXTURE                 = 0,
    GPUCONSTANTTYPE_INVALID_VERTEX                  = 1,
    GPUCONSTANTTYPE_TEXTURE                         = 2,
    GPUCONSTANTTYPE_VERTEX                          = 3,
} GPUCONSTANTTYPE;

// Clamp mode encodings (values 0..7).  From value 2 onwards the entries
// come in pairs: an even CLAMP_* value followed by the corresponding odd
// MIRROR_* value.

typedef enum
{
    GPUCLAMP_WRAP                                   = 0,
    GPUCLAMP_MIRROR                                 = 1,
    GPUCLAMP_CLAMP_TO_LAST                          = 2,
    GPUCLAMP_MIRROR_ONCE_TO_LAST                    = 3,
    GPUCLAMP_CLAMP_HALFWAY                          = 4,
    GPUCLAMP_MIRROR_ONCE_HALFWAY                    = 5,
    GPUCLAMP_CLAMP_TO_BORDER                        = 6,
    GPUCLAMP_MIRROR_TO_BORDER                       = 7,
} GPUCLAMP;

// Dimension encodings (values 0..3).  Note that the values are not the
// dimension count: 1D is 0, 2D is 1 and 3D is 2.

typedef enum
{
    GPUDIMENSION_1D                                 = 0,
    GPUDIMENSION_2D                                 = 1,
    GPUDIMENSION_3D                                 = 2,
    GPUDIMENSION_CUBEMAP                            = 3,
} GPUDIMENSION;

// Request size selector: 256 bit (0) or 512 bit (1).

typedef enum
{
    GPUREQUESTSIZE_256BIT                           = 0,
    GPUREQUESTSIZE_512BIT                           = 1,
} GPUREQUESTSIZE;

// Clamp policy selector: D3D (0) or OGL (1).

typedef enum
{
    GPUCLAMPPOLICY_D3D                              = 0,
    GPUCLAMPPOLICY_OGL                              = 1,
} GPUCLAMPPOLICY;

// Min/mag filter encodings.  Value 2 has no name in this enum (GPUMIPFILTER
// uses 2 for BASEMAP); KEEP is only valid in texture fetch instructions.

typedef enum
{
    GPUMINMAGFILTER_POINT                           = 0,
    GPUMINMAGFILTER_LINEAR                          = 1,
    GPUMINMAGFILTER_KEEP                            = 3, // Texture fetch instructions only
} GPUMINMAGFILTER;

// Mip filter encodings (values 0..3).  POINT, LINEAR and KEEP have the same
// numeric values as in GPUMINMAGFILTER; KEEP is only valid in texture fetch
// instructions.

typedef enum
{
    GPUMIPFILTER_POINT                              = 0,
    GPUMIPFILTER_LINEAR                             = 1,
    GPUMIPFILTER_BASEMAP                            = 2,
    GPUMIPFILTER_KEEP                               = 3, // Texture fetch instructions only
} GPUMIPFILTER;

// Anisotropic filter encodings.  Values 1..5 correspond to maximum ratios
// of 1:1, 2:1, 4:1, 8:1 and 16:1, i.e. a ratio of 2^(value - 1) to 1.
// Value 6 has no name in this header; KEEP is only valid in texture fetch
// instructions.

typedef enum
{
    GPUANISOFILTER_DISABLED                         = 0,
    GPUANISOFILTER_MAX1TO1                          = 1,
    GPUANISOFILTER_MAX2TO1                          = 2,
    GPUANISOFILTER_MAX4TO1                          = 3,
    GPUANISOFILTER_MAX8TO1                          = 4,
    GPUANISOFILTER_MAX16TO1                         = 5 ,
    GPUANISOFILTER_KEEP                             = 7, // Texture fetch instructions only
} GPUANISOFILTER;

// Border color encodings (values 0..3).  The enumerator names encode a
// channel order (ABGR, ACBYCR, ACBCRY) together with the color; only the
// ABGR order has a WHITE entry.

typedef enum
{
    GPUBORDERCOLOR_ABGR_BLACK                       = 0,
    GPUBORDERCOLOR_ABGR_WHITE                       = 1,
    GPUBORDERCOLOR_ACBYCR_BLACK                     = 2,
    GPUBORDERCOLOR_ACBCRY_BLACK                     = 3,
} GPUBORDERCOLOR;

// Tri clamp encodings (values 0..3).  The non-zero entries are named after
// the fractions 1/6, 1/4 and 3/8.

typedef enum
{
    GPUTRICLAMP_NORMAL                              = 0,
    GPUTRICLAMP_ONE_SIXTH                           = 1,
    GPUTRICLAMP_ONE_FOURTH                          = 2,
    GPUTRICLAMP_THREE_EIGHTHS                       = 3,
} GPUTRICLAMP;

// Address clamp selector: clamp to last (0) or clamp to constant (1).

typedef enum
{
    GPUADDRESSCLAMP_CLAMP_TO_LAST                   = 0,
    GPUADDRESSCLAMP_CLAMP_TO_CONSTANT               = 1,
} GPUADDRESSCLAMP;

// Surface number format encodings.  Values 0..3 and 7 are defined; 4..6
// have no name in this header.  In values 0..3, odd values are the S*
// entries and even values the U* entries.

typedef enum
{
    GPUSURFACENUMBER_UREPEAT                        = 0,
    GPUSURFACENUMBER_SREPEAT                        = 1,
    GPUSURFACENUMBER_UINTEGER                       = 2,
    GPUSURFACENUMBER_SINTEGER                       = 3,
    GPUSURFACENUMBER_FLOAT                          = 7,
} GPUSURFACENUMBER;

// Surface swap selector: low red (0) or low blue (1).
// NOTE(review): unlike the other enums in this header, the enumerators here
// are not prefixed with "GPU" although the typedef is.  Renaming them would
// break existing users, so they are left as-is.

typedef enum
{
    SURFACESWAP_LOW_RED                             = 0,
    SURFACESWAP_LOW_BLUE                            = 1,
} GPUSURFACESWAP;

// Primitive type encodings.  Values 0..8 and 12..22 are defined; 9..11 have
// no name in this header.  The 2D_* entries occupy 16..22.

typedef enum
{
    GPUPRIMTYPE_NONE                                = 0,
    GPUPRIMTYPE_POINTLIST                           = 1,
    GPUPRIMTYPE_LINELIST                            = 2,
    GPUPRIMTYPE_LINESTRIP                           = 3,
    GPUPRIMTYPE_TRILIST                             = 4,
    GPUPRIMTYPE_TRIFAN                              = 5,
    GPUPRIMTYPE_TRISTRIP                            = 6,
    GPUPRIMTYPE_TRI_WITH_WFLAGS                     = 7,
    GPUPRIMTYPE_RECTLIST                            = 8,
    GPUPRIMTYPE_LINELOOP                            = 12,
    GPUPRIMTYPE_QUADLIST                            = 13,
    GPUPRIMTYPE_QUADSTRIP                           = 14,
    GPUPRIMTYPE_POLYGON                             = 15,
    GPUPRIMTYPE_2D_COPY_RECT_LIST_V0                = 16,
    GPUPRIMTYPE_2D_COPY_RECT_LIST_V1                = 17,
    GPUPRIMTYPE_2D_COPY_RECT_LIST_V2                = 18,
    GPUPRIMTYPE_2D_COPY_RECT_LIST_V3                = 19,
    GPUPRIMTYPE_2D_FILL_RECT_LIST                   = 20,
    GPUPRIMTYPE_2D_LINE_STRIP                       = 21,
    GPUPRIMTYPE_2D_TRI_STRIP                        = 22,
} GPUPRIMTYPE;

// Group primitive type encodings (contiguous values 0..14): 3D_* entries at
// 0..4, 2D_* entries at 5..11 and PRIM_INDEX_* entries at 12..14.

typedef enum
{
    GPUGROUPPRIMTYPE_3D_POINT                       = 0,
    GPUGROUPPRIMTYPE_3D_LINE                        = 1,
    GPUGROUPPRIMTYPE_3D_TRI                         = 2,
    GPUGROUPPRIMTYPE_3D_RECT                        = 3,
    GPUGROUPPRIMTYPE_3D_QUAD                        = 4,
    GPUGROUPPRIMTYPE_2D_COPY_RECT_V0                = 5,
    GPUGROUPPRIMTYPE_2D_COPY_RECT_V1                = 6,
    GPUGROUPPRIMTYPE_2D_COPY_RECT_V2                = 7,
    GPUGROUPPRIMTYPE_2D_COPY_RECT_V3                = 8,
    GPUGROUPPRIMTYPE_2D_FILL_RECT                   = 9,
    GPUGROUPPRIMTYPE_2D_LINE                        = 10,
    GPUGROUPPRIMTYPE_2D_TRI                         = 11,
    GPUGROUPPRIMTYPE_PRIM_INDEX_LINE                = 12,
    GPUGROUPPRIMTYPE_PRIM_INDEX_TRI                 = 13,
    GPUGROUPPRIMTYPE_PRIM_INDEX_QUAD                = 14,
} GPUGROUPPRIMTYPE;

// Group primitive order encodings (contiguous values 0..4).

typedef enum
{
    GPUGROUPPRIMORDER_LIST                          = 0,
    GPUGROUPPRIMORDER_STRIP                         = 1,
    GPUGROUPPRIMORDER_FAN                           = 2,
    GPUGROUPPRIMORDER_LOOP                          = 3,
    GPUGROUPPRIMORDER_POLYGON                       = 4,
} GPUGROUPPRIMORDER;

// Group conversion encodings (contiguous values 0..8).  For values 0..5 the
// entries come in 16/32 pairs: the even value is the *_16 entry and the
// following odd value is the matching *_32 entry.

typedef enum
{
    GPUGROUPCONV_INDEX_16                           = 0,
    GPUGROUPCONV_INDEX_32                           = 1,
    GPUGROUPCONV_UINT_16                            = 2,
    GPUGROUPCONV_UINT_32                            = 3,
    GPUGROUPCONV_SINT_16                            = 4,
    GPUGROUPCONV_SINT_32                            = 5,
    GPUGROUPCONV_FLOAT_32                           = 6,
    GPUGROUPCONV_AUTO_PRIM                          = 7,
    GPUGROUPCONV_FIX_1_23_TO_FLOAT                  = 8,
} GPUGROUPCONV;

// Comparison function codes.  All eight values fit in three bits; the
// depth-control bitfields later in this header annotate their ZFunc and
// StencilFunc members with this type name.
typedef enum {
    GPUCMP_NEVER = 0x0,
    GPUCMP_LESS = 0x1,
    GPUCMP_EQUAL = 0x2,
    GPUCMP_LESS_EQUAL = 0x3,
    GPUCMP_GREATER = 0x4,
    GPUCMP_NOT_EQUAL = 0x5,
    GPUCMP_GREATER_EQUAL = 0x6,
    GPUCMP_ALWAYS = 0x7,
} GPUCMPFUNC;

// Stencil operation codes.  All eight values fit in three bits; the
// depth-control bitfields later in this header annotate their stencil
// fail/pass members with this type name.
typedef enum {
    GPUSTENCILOP_KEEP = 0x0,
    GPUSTENCILOP_ZERO = 0x1,
    GPUSTENCILOP_REPLACE = 0x2,
    GPUSTENCILOP_INCRSAT = 0x3,
    GPUSTENCILOP_DECRSAT = 0x4,
    GPUSTENCILOP_INVERT = 0x5,
    GPUSTENCILOP_INCR = 0x6,
    GPUSTENCILOP_DECR = 0x7,
} GPUSTENCILOP;

// Blend factor codes.  Values 2 and 3 are not assigned in this list; the
// remaining codes run from 4 through 16.
typedef enum {
    GPUBLEND_ZERO = 0x00,
    GPUBLEND_ONE = 0x01,
    GPUBLEND_SRCCOLOR = 0x04,
    GPUBLEND_INVSRCCOLOR = 0x05,
    GPUBLEND_SRCALPHA = 0x06,
    GPUBLEND_INVSRCALPHA = 0x07,
    GPUBLEND_DESTCOLOR = 0x08,
    GPUBLEND_INVDESTCOLOR = 0x09,
    GPUBLEND_DESTALPHA = 0x0A,
    GPUBLEND_INVDESTALPHA = 0x0B,
    GPUBLEND_BLENDFACTOR = 0x0C,
    GPUBLEND_INVBLENDFACTOR = 0x0D,
    GPUBLEND_CONSTANTALPHA = 0x0E,
    GPUBLEND_INVCONSTANTALPHA = 0x0F,
    GPUBLEND_SRCALPHASAT = 0x10,
} GPUBLEND;

// Blend operation codes, numbered 0 through 4.
typedef enum {
    GPUBLENDOP_ADD = 0x0,
    GPUBLENDOP_SUBTRACT = 0x1,
    GPUBLENDOP_MIN = 0x2,
    GPUBLENDOP_MAX = 0x3,
    GPUBLENDOP_REVSUBTRACT = 0x4,
} GPUBLENDOP;

// Combined cull-mode / front-face-winding codes.
// In the values below the low two bits pick NONE (0), FRONT (1) or BACK (2),
// and bit 2 (value 4) is set for every *_FRONTFACE_CW entry and clear for
// every *_FRONTFACE_CCW entry.
typedef enum {
    GPUCULL_NONE_FRONTFACE_CCW = 0,
    GPUCULL_FRONT_FRONTFACE_CCW = 1,
    GPUCULL_BACK_FRONTFACE_CCW = 2,
    GPUCULL_NONE_FRONTFACE_CW = 4 | 0,
    GPUCULL_FRONT_FRONTFACE_CW = 4 | 1,
    GPUCULL_BACK_FRONTFACE_CW = 4 | 2,
} GPUCULL;

// Fill mode codes: point, wireframe, solid.
typedef enum {
    GPUFILL_POINT = 0x0,
    GPUFILL_WIREFRAME = 0x1,
    GPUFILL_SOLID = 0x2,
} GPUFILLMODE;

// Vertex signedness selector: 0 = unsigned, 1 = signed.
typedef enum {
    GPUVERTEXSIGN_UNSIGNED = 0x0,
    GPUVERTEXSIGN_SIGNED = 0x1
} GPUVERTEXSIGN;

// Vertex fetch opcodes.  Only one opcode is defined here.
typedef enum {
    GPUVERTEXFETCHOP_FETCH_VERTEX = 0x0,
} GPUVERTEXFETCHOP;

// Texture fetch opcodes.  The numbering is sparse: the fetch itself is 1,
// the GET_* opcodes occupy 16-19 and the SET_* opcodes occupy 24-26.
typedef enum {
    GPUTEXTUREFETCHOP_FETCH_TEXTURE_MAP = 0x01,
    GPUTEXTUREFETCHOP_GET_BORDER_COLOR_FRACTION = 0x10,
    GPUTEXTUREFETCHOP_GET_COMPUTED_TEX_LOD = 0x11,
    GPUTEXTUREFETCHOP_GET_GRADIENTS = 0x12,
    GPUTEXTUREFETCHOP_GET_WEIGHTS = 0x13,
    GPUTEXTUREFETCHOP_SET_TEX_LOD = 0x18,
    GPUTEXTUREFETCHOP_SET_GRADIENTS_H = 0x19,
    GPUTEXTUREFETCHOP_SET_GRADIENTS_V = 0x1A,
} GPUTEXTUREFETCHOP;

// Flow-control opcodes.  The sixteen values 0-15 are all assigned, so the
// set exactly fills a four-bit range.
typedef enum {
    GPUFLOWOP_NOP = 0x0,
    GPUFLOWOP_EXEC = 0x1,
    GPUFLOWOP_EXEC_END = 0x2,
    GPUFLOWOP_COND_EXEC = 0x3,
    GPUFLOWOP_COND_EXEC_END = 0x4,
    GPUFLOWOP_COND_EXEC_PRED = 0x5,
    GPUFLOWOP_COND_EXEC_PRED_END = 0x6,
    GPUFLOWOP_LOOP_START = 0x7,
    GPUFLOWOP_LOOP_END = 0x8,
    GPUFLOWOP_COND_CALL = 0x9,
    GPUFLOWOP_RETURN = 0xA,
    GPUFLOWOP_COND_JUMP = 0xB,
    GPUFLOWOP_ALLOC = 0xC,
    GPUFLOWOP_COND_EXEC_PRED_CLEAN = 0xD,
    GPUFLOWOP_COND_EXEC_PRED_CLEAN_END = 0xE,
    GPUFLOWOP_VFETCH_END = 0xF,
} GPUFLOWOP;

// Scalar ALU opcodes.  Codes run from 0 through 50; value 41 (0x29) is not
// assigned in this list.
typedef enum {
    GPUALUSCALAROP_ADD = 0x00,
    GPUALUSCALAROP_ADDPREV = 0x01,
    GPUALUSCALAROP_MUL = 0x02,
    GPUALUSCALAROP_MULPREV = 0x03,
    GPUALUSCALAROP_MULPREV2 = 0x04,
    GPUALUSCALAROP_MAX = 0x05,
    GPUALUSCALAROP_MIN = 0x06,
    GPUALUSCALAROP_SEQ = 0x07,
    GPUALUSCALAROP_SGT = 0x08,
    GPUALUSCALAROP_SGE = 0x09,
    GPUALUSCALAROP_SNE = 0x0A,
    GPUALUSCALAROP_FRC = 0x0B,
    GPUALUSCALAROP_TRUNC = 0x0C,
    GPUALUSCALAROP_FLOOR = 0x0D,
    GPUALUSCALAROP_EXP = 0x0E,
    GPUALUSCALAROP_LOGC = 0x0F,
    GPUALUSCALAROP_LOG = 0x10,
    GPUALUSCALAROP_RCPC = 0x11,
    GPUALUSCALAROP_RCPF = 0x12,
    GPUALUSCALAROP_RCP = 0x13,
    GPUALUSCALAROP_RSQC = 0x14,
    GPUALUSCALAROP_RSQF = 0x15,
    GPUALUSCALAROP_RSQ = 0x16,
    GPUALUSCALAROP_MAXA = 0x17,
    GPUALUSCALAROP_MAXAF = 0x18,
    GPUALUSCALAROP_SUB = 0x19,
    GPUALUSCALAROP_SUBPREV = 0x1A,
    GPUALUSCALAROP_SETPEQ = 0x1B,
    GPUALUSCALAROP_SETPNE = 0x1C,
    GPUALUSCALAROP_SETPGT = 0x1D,
    GPUALUSCALAROP_SETPGE = 0x1E,
    GPUALUSCALAROP_SETPINV = 0x1F,
    GPUALUSCALAROP_SETPPOP = 0x20,
    GPUALUSCALAROP_SETPCLR = 0x21,
    GPUALUSCALAROP_SETPRSTR = 0x22,
    GPUALUSCALAROP_KILLEQ = 0x23,
    GPUALUSCALAROP_KILLGT = 0x24,
    GPUALUSCALAROP_KILLGE = 0x25,
    GPUALUSCALAROP_KILLNE = 0x26,
    GPUALUSCALAROP_KILLONE = 0x27,
    GPUALUSCALAROP_SQRT = 0x28,
    GPUALUSCALAROP_MULC0 = 0x2A,
    GPUALUSCALAROP_MULC1 = 0x2B,
    GPUALUSCALAROP_ADDC0 = 0x2C,
    GPUALUSCALAROP_ADDC1 = 0x2D,
    GPUALUSCALAROP_SUBC0 = 0x2E,
    GPUALUSCALAROP_SUBC1 = 0x2F,
    GPUALUSCALAROP_SIN = 0x30,
    GPUALUSCALAROP_COS = 0x31,
    GPUALUSCALAROP_RETAINPREV = 0x32
} GPUALUSCALAROP;

// Vector ALU opcodes, numbered contiguously from 0 through 29.
typedef enum {
    GPUALUVECTOROP_ADD = 0x00,
    GPUALUVECTOROP_MUL = 0x01,
    GPUALUVECTOROP_MAX = 0x02,
    GPUALUVECTOROP_MIN = 0x03,
    GPUALUVECTOROP_SEQ = 0x04,
    GPUALUVECTOROP_SGT = 0x05,
    GPUALUVECTOROP_SGE = 0x06,
    GPUALUVECTOROP_SNE = 0x07,
    GPUALUVECTOROP_FRC = 0x08,
    GPUALUVECTOROP_TRUNC = 0x09,
    GPUALUVECTOROP_FLOOR = 0x0A,
    GPUALUVECTOROP_MAD = 0x0B,
    GPUALUVECTOROP_CNDEQ = 0x0C,
    GPUALUVECTOROP_CNDGE = 0x0D,
    GPUALUVECTOROP_CNDGT = 0x0E,
    GPUALUVECTOROP_DP4 = 0x0F,
    GPUALUVECTOROP_DP3 = 0x10,
    GPUALUVECTOROP_DP2ADD = 0x11,
    GPUALUVECTOROP_CUBE = 0x12,
    GPUALUVECTOROP_MAX4 = 0x13,
    GPUALUVECTOROP_SETPEQP = 0x14,
    GPUALUVECTOROP_SETPNEP = 0x15,
    GPUALUVECTOROP_SETPGTP = 0x16,
    GPUALUVECTOROP_SETPGEP = 0x17,
    GPUALUVECTOROP_KILLEQ = 0x18,
    GPUALUVECTOROP_KILLGT = 0x19,
    GPUALUVECTOROP_KILLGE = 0x1A,
    GPUALUVECTOROP_KILLNE = 0x1B,
    GPUALUVECTOROP_DST = 0x1C,
    GPUALUVECTOROP_MAXA = 0x1D
} GPUALUVECTOROP;

// ALU source selector: two choices, named C (0) and R (1).
typedef enum {
    GPUALUSRCSELECT_C = 0x0,
    GPUALUSRCSELECT_R = 0x1
} GPUALUSRCSELECT;

// Pixel-shader export register numbers: four color registers (0-3), an
// export address/data group (32-37) and the depth register (61).
typedef enum {
    GPUEXPORTREGISTER_PS_COLOR_0 = 0x00,
    GPUEXPORTREGISTER_PS_COLOR_1 = 0x01,
    GPUEXPORTREGISTER_PS_COLOR_2 = 0x02,
    GPUEXPORTREGISTER_PS_COLOR_3 = 0x03,

    GPUEXPORTREGISTER_PS_EXPORT_ADDRESS = 0x20,
    GPUEXPORTREGISTER_PS_EXPORT_DATA_0 = 0x21,
    GPUEXPORTREGISTER_PS_EXPORT_DATA_1 = 0x22,
    GPUEXPORTREGISTER_PS_EXPORT_DATA_2 = 0x23,
    GPUEXPORTREGISTER_PS_EXPORT_DATA_3 = 0x24,
    GPUEXPORTREGISTER_PS_EXPORT_DATA_4 = 0x25,

    GPUEXPORTREGISTER_PS_DEPTH = 0x3D,
} GPUEXPORTREGISTER_PS;

// Vertex-shader export register numbers: sixteen interpolators (0-15), an
// export address/data group (32-37), then position (62) and
// sprite/edge/kill (63).
typedef enum {
    GPUEXPORTREGISTER_VS_INTERPOLATOR_0 = 0x00,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_1 = 0x01,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_2 = 0x02,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_3 = 0x03,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_4 = 0x04,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_5 = 0x05,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_6 = 0x06,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_7 = 0x07,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_8 = 0x08,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_9 = 0x09,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_10 = 0x0A,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_11 = 0x0B,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_12 = 0x0C,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_13 = 0x0D,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_14 = 0x0E,
    GPUEXPORTREGISTER_VS_INTERPOLATOR_15 = 0x0F,

    GPUEXPORTREGISTER_VS_EXPORT_ADDRESS = 0x20,
    GPUEXPORTREGISTER_VS_EXPORT_DATA_0 = 0x21,
    GPUEXPORTREGISTER_VS_EXPORT_DATA_1 = 0x22,
    GPUEXPORTREGISTER_VS_EXPORT_DATA_2 = 0x23,
    GPUEXPORTREGISTER_VS_EXPORT_DATA_3 = 0x24,
    GPUEXPORTREGISTER_VS_EXPORT_DATA_4 = 0x25,

    GPUEXPORTREGISTER_VS_POSITION = 0x3E,
    GPUEXPORTREGISTER_VS_SPRITE_EDGE_KILL = 0x3F,
} GPUEXPORTREGISTER_VS;

// Command opcodes (GPUCOMMANDOP_*).
// The four SET_BIN_* opcodes have their own values (0x60-0x63) in a normal
// build; when XAM_BUILD is defined they are all aliased to
// GPUCOMMANDOP_NOP instead.
typedef enum {
    GPUCOMMANDOP_NOP = 0x10,
    GPUCOMMANDOP_REG_RMW = 0x21,
    GPUCOMMANDOP_DRAW = 0x22,
    GPUCOMMANDOP_VIZ_QUERY = 0x23,
    GPUCOMMANDOP_SET_STATE = 0x25,
    GPUCOMMANDOP_WAIT_FOR_IDLE = 0x26,
    GPUCOMMANDOP_LOAD_SHADER = 0x27,
    GPUCOMMANDOP_LOAD_SHADER_IMMEDIATE = 0x2B,
    GPUCOMMANDOP_SET_CONSTANT = 0x2D,
    GPUCOMMANDOP_LOAD_ALU_CONSTANT = 0x2F,
    GPUCOMMANDOP_DRAW_IMMEDIATE = 0x36,
    GPUCOMMANDOP_MPEG_INDEX = 0x3A,
    GPUCOMMANDOP_INVALIDATE_STATE = 0x3B,
    GPUCOMMANDOP_WAIT_REG_MEM = 0x3C,
    GPUCOMMANDOP_MEM_WRITE = 0x3D,
    GPUCOMMANDOP_REG_TO_MEM = 0x3E,
    GPUCOMMANDOP_INDIRECT_BUFFER = 0x3F,
    GPUCOMMANDOP_COND_WRITE = 0x45,
    GPUCOMMANDOP_EVENT_WRITE = 0x46,
    GPUCOMMANDOP_ME_INIT = 0x48,
    GPUCOMMANDOP_FIX_2_FLT_REG = 0x4D,
    GPUCOMMANDOP_MEM_WRITE_COUNTER = 0x4F,
    GPUCOMMANDOP_WAIT_REG_EQ = 0x52,
    GPUCOMMANDOP_WAIT_REG_GTE = 0x53,
    GPUCOMMANDOP_CPU_INTERRUPT = 0x54,
    GPUCOMMANDOP_EVENT_WRITE_SHADER = 0x58,
    GPUCOMMANDOP_EVENT_WRITE_CACHE_FLUSH = 0x59,
    GPUCOMMANDOP_EVENT_WRITE_SCREEN_EXTENT = 0x5A,
    GPUCOMMANDOP_EVENT_WRITE_ZPASS_DONE = 0x5B,
    GPUCOMMANDOP_CONTEXT_UPDATE = 0x5E,
#ifndef XAM_BUILD
    GPUCOMMANDOP_SET_BIN_MASK_LO = 0x60,
    GPUCOMMANDOP_SET_BIN_MASK_HI = 0x61,
    GPUCOMMANDOP_SET_BIN_SELECT_LO = 0x62,
    GPUCOMMANDOP_SET_BIN_SELECT_HI = 0x63,
#else
    GPUCOMMANDOP_SET_BIN_MASK_LO = GPUCOMMANDOP_NOP,
    GPUCOMMANDOP_SET_BIN_MASK_HI = GPUCOMMANDOP_NOP,
    GPUCOMMANDOP_SET_BIN_SELECT_LO = GPUCOMMANDOP_NOP,
    GPUCOMMANDOP_SET_BIN_SELECT_HI = GPUCOMMANDOP_NOP,
#endif
} GPUCOMMANDOP;

// Load type selector: vertex, pixel or shared.
typedef enum {
    GPULOADTYPE_VERTEX = 0x0,
    GPULOADTYPE_PIXEL = 0x1,
    GPULOADTYPE_SHARED = 0x2,
} GPULOADTYPE;

// Hi-Z function selector (two choices: less-equal or greater-equal).
typedef enum {
    GPUHIZFUNC_LESS_EQUAL = 0x0,
    GPUHIZFUNC_GREATER_EQUAL = 0x1,
} GPUHIZFUNC;

// Hi-stencil function selector (two choices: equal or not-equal).
typedef enum {
    GPUHISTENCILFUNC_EQUAL = 0x0,
    GPUHISTENCILFUNC_NOT_EQUAL = 0x1,
} GPUHISTENCILFUNC;

// EDRAM mode codes.  Values 1-3 are not assigned in this list.
typedef enum {
    GPUEDRAMMODE_NOP = 0x0,
    GPUEDRAMMODE_COLOR_DEPTH = 0x4,
    GPUEDRAMMODE_DOUBLE_DEPTH = 0x5,
    GPUEDRAMMODE_COPY = 0x6,
} GPUEDRAMMODE;

// Clip-plane mode codes; the four values 0-3 fill a two-bit range.
typedef enum {
    GPUCLIPPLANEMODE_CULL_CENTER_NO_BIAS = 0x0,
    GPUCLIPPLANEMODE_CULL_CENTER_RADIUS_BIAS = 0x1,
    GPUCLIPPLANEMODE_CULL_CENTER_RADIUS_BIAS_EXPAND = 0x2,
    GPUCLIPPLANEMODE_ALWAYS_EXPAND = 0x3,
} GPUCLIPPLANEMODE;

// Sample control codes.  The two-bit SampleControl field of
// GPU_CONTEXTMISC is annotated with this type name.
typedef enum {
    GPUSAMPLECONTROL_CENTROIDS_ONLY = 0x0,
    GPUSAMPLECONTROL_CENTERS_ONLY = 0x1,
    GPUSAMPLECONTROL_CENTROIDS_AND_CENTERS = 0x2,
} GPUSAMPLECONTROL;

// Path select codes: vertex reuse, tessellation enable, or pass-through.
typedef enum {
    GPUPATHSELECT_VERTEX_REUSE = 0x0,
    GPUPATHSELECT_TESS_ENABLE = 0x1,
    GPUPATHSELECT_PASSTHRU = 0x2,
} GPUPATHSELECT;

// Tessellation mode codes: discrete, continuous or adaptive.
typedef enum {
    GPUTESSMODE_DISCRETE = 0x0,
    GPUTESSMODE_CONTINUOUS = 0x1,
    GPUTESSMODE_ADAPTIVE = 0x2,
} GPUTESSMODE;

// Pixel center selector: ZERO (0) or HALF (1).
typedef enum {
    GPUPIXCENTER_ZERO = 0x0,
    GPUPIXCENTER_HALF = 0x1,
} GPUPIXCENTER;

// Rounding mode codes; the four values 0-3 fill a two-bit range.
typedef enum {
    GPUROUNDMODE_TRUNCATE = 0x0,
    GPUROUNDMODE_ROUND = 0x1,
    GPUROUNDMODE_ROUND_TO_EVEN = 0x2,
    GPUROUNDMODE_ROUND_TO_ODD = 0x3,
} GPUROUNDMODE;

// Quantization mode codes, ordered from the finest step (16TH = 0) to the
// coarsest (ONE = 4).
typedef enum {
    GPUQUANTMODE_16TH = 0x0,
    GPUQUANTMODE_8TH = 0x1,
    GPUQUANTMODE_4TH = 0x2,
    GPUQUANTMODE_HALF = 0x3,
    GPUQUANTMODE_ONE = 0x4,
} GPUQUANTMODE;

// Copy source selector: render targets 0-3, or depth/stencil (4).
typedef enum {
    GPUCOPYSRCSELECT_RENDER_TARGET_0 = 0x0,
    GPUCOPYSRCSELECT_RENDER_TARGET_1 = 0x1,
    GPUCOPYSRCSELECT_RENDER_TARGET_2 = 0x2,
    GPUCOPYSRCSELECT_RENDER_TARGET_3 = 0x3,
    GPUCOPYSRCSELECT_DEPTH_STENCIL = 0x4,
} GPUCOPYSRCSELECT;

// Copy sample selector: a single sample (0-3) or a sample group (4-6).
typedef enum {
    GPUCOPYSAMPLESELECT_SAMPLE_0 = 0x0,
    GPUCOPYSAMPLESELECT_SAMPLE_1 = 0x1,
    GPUCOPYSAMPLESELECT_SAMPLE_2 = 0x2,
    GPUCOPYSAMPLESELECT_SAMPLE_3 = 0x3,
    GPUCOPYSAMPLESELECT_SAMPLES_0_1 = 0x4,
    GPUCOPYSAMPLESELECT_SAMPLES_2_3 = 0x5,
    GPUCOPYSAMPLESELECT_SAMPLES_0_1_2_3 = 0x6,
} GPUCOPYSAMPLESELECT;

// Copy command codes; the four values 0-3 fill a two-bit range.
typedef enum {
    GPUCOPYCOMMAND_RAW = 0x0,
    GPUCOPYCOMMAND_CONVERT = 0x1,
    GPUCOPYCOMMAND_1_1_1_1 = 0x2,
    GPUCOPYCOMMAND_NULL = 0x3,
} GPUCOPYCOMMAND;

// Vertex-shader export mode codes.  Value 1 is not assigned in this list.
// The three-bit VsExportMode field of GPU_PROGRAMCONTROL is annotated with
// this type name.
typedef enum {
    GPUVSEXPORTMODE_POSITION_ONLY = 0x0,
    GPUVSEXPORTMODE_SPRITE = 0x2,
    GPUVSEXPORTMODE_EDGE = 0x3,
    GPUVSEXPORTMODE_KILL = 0x4,
    GPUVSEXPORTMODE_SPRITE_KILL = 0x5,
    GPUVSEXPORTMODE_EDGE_KILL = 0x6,
    GPUVSEXPORTMODE_MULTIPASS = 0x7,
} GPUVSEXPORTMODE;

// Multisample count codes (1X / 2X / 4X).  The two-bit MsaaSamples field of
// GPU_SURFACEINFO is annotated with this type name.
typedef enum {
    GPUSAMPLES_1X = 0x0,
    GPUSAMPLES_2X = 0x1,
    GPUSAMPLES_4X = 0x2,
} GPUSAMPLES;

// Exec instruction type: ALU (0) or FETCH (1).
typedef enum {
    GPUEXECINSTRUCTIONTYPE_ALU = 0x0,
    GPUEXECINSTRUCTIONTYPE_FETCH = 0x1,
} GPUEXECINSTRUCTIONTYPE;

// Exec serialize mode: UNSERIALIZED (0) or SERIALIZED (1).
typedef enum {
    GPUEXECSERIALIZEMODE_UNSERIALIZED = 0x0,
    GPUEXECSERIALIZEMODE_SERIALIZED = 0x1,
} GPUEXECSERIALIZEMODE;

// Exec fetch cache type: TEXTURE (0) or VERTEX (1).
typedef enum {
    GPUEXECFETCHCACHETYPE_TEXTURE = 0x0,
    GPUEXECFETCHCACHETYPE_VERTEX = 0x1,
} GPUEXECFETCHCACHETYPE;

// Alloc buffer selector.  INTERPOLATORS and COLORS deliberately share the
// value 2: the first name is for vertex shaders, the second for pixel
// shaders.  Value 0 is not assigned in this list.
typedef enum {
    GPUALLOCBUFFERSELECT_POSITION = 0x1,
    GPUALLOCBUFFERSELECT_INTERPOLATORS = 0x2, // for vertex shaders
    GPUALLOCBUFFERSELECT_COLORS = GPUALLOCBUFFERSELECT_INTERPOLATORS, // for pixel shaders
    GPUALLOCBUFFERSELECT_EXPORT = 0x3
} GPUALLOCBUFFERSELECT;

// Screen extents selector: all tiles, non-culled, or primitive extents.
typedef enum {
    GPUSCREENEXTENTS_ALLTILES = 0x0,
    GPUSCREENEXTENTS_NONCULLED = 0x1,
    GPUSCREENEXTENTS_PRIMEXTENTS = 0x2,
} GPUSCREENEXTENTS;

// Event initiator codes.  Values 0-15 and 20-27 are assigned; 16-19 are not
// assigned in this list.
typedef enum {
    GPUINITIATOR_VS_DEALLOC = 0x00,
    GPUINITIATOR_PS_DEALLOC = 0x01,
    GPUINITIATOR_VS_DONE_WRITE_BACK = 0x02,
    GPUINITIATOR_PS_DONE_WRITE_BACK = 0x03,
    GPUINITIATOR_CACHE_FLUSH_WRITE_BACK = 0x04,
    GPUINITIATOR_CONTEXT_DONE = 0x05,
    GPUINITIATOR_CACHE_FLUSH = 0x06,
    GPUINITIATOR_VIZQUERY_START = 0x07,
    GPUINITIATOR_VIZQUERY_END = 0x08,
    GPUINITIATOR_SC_WAIT_WC = 0x09,
    GPUINITIATOR_MPASS_PS_CP_REFETCH = 0x0A,
    GPUINITIATOR_MPASS_PS_RST_START = 0x0B,
    GPUINITIATOR_MPASS_PS_INCR_START = 0x0C,
    GPUINITIATOR_RST_PIX_CNT = 0x0D,
    GPUINITIATOR_RST_VTX_CNT = 0x0E,
    GPUINITIATOR_TILE_FLUSH = 0x0F,
    GPUINITIATOR_CACHE_FLUSH_AND_INV_WRITE_BACK_EVENT = 0x14,
    GPUINITIATOR_ZPASS_DONE = 0x15,
    GPUINITIATOR_CACHE_FLUSH_AND_INV_EVENT = 0x16,
    GPUINITIATOR_PERFCOUNTER_START = 0x17,
    GPUINITIATOR_PERFCOUNTER_STOP = 0x18,
    GPUINITIATOR_SCREEN_EXT_INIT = 0x19,
    GPUINITIATOR_SCREEN_EXT_RPT = 0x1A,
    GPUINITIATOR_VS_FETCH_DONE_WRITE_BACK = 0x1B,
} GPUINITIATOR;

// Sync comparison function codes.
// Note that this numbering is NOT the same as GPUCMPFUNC: here LEQUAL is 2
// and EQUAL is 3, and GEQUAL (5) comes before GREATER (6).
typedef enum {
    GPUSYNCFUNCTION_NEVER = 0x0,
    GPUSYNCFUNCTION_LESS = 0x1,
    GPUSYNCFUNCTION_LEQUAL = 0x2,
    GPUSYNCFUNCTION_EQUAL = 0x3,
    GPUSYNCFUNCTION_NOTEQUAL = 0x4,
    GPUSYNCFUNCTION_GEQUAL = 0x5,
    GPUSYNCFUNCTION_GREATER = 0x6,
    GPUSYNCFUNCTION_ALWAYS = 0x7,
} GPUSYNCFUNCTION;

// Sync space selector: REGISTER (0) or MEMORY (1).
typedef enum {
    GPUSYNCSPACE_REGISTER = 0x0,
    GPUSYNCSPACE_MEMORY = 0x1,
} GPUSYNCSPACE;

// Constant identifier codes, numbered 0 through 4.
typedef enum {
    GPUCONSTANTID_ALU = 0x0,
    GPUCONSTANTID_FETCH = 0x1,
    GPUCONSTANTID_BOOLEAN = 0x2,
    GPUCONSTANTID_INTEGER = 0x3,
    GPUCONSTANTID_REGISTER = 0x4,
} GPUCONSTANTID;

// Index source selector: DMA, IMMEDIATE or AUTO.
typedef enum {
    GPUINDEXSELECT_DMA = 0x0,
    GPUINDEXSELECT_IMMEDIATE = 0x1,
    GPUINDEXSELECT_AUTO = 0x2,
} GPUINDEXSELECT;

// Index width selector: 16-bit (0) or 32-bit (1).
typedef enum {
    GPUINDEXTYPE_16BIT = 0x0,
    GPUINDEXTYPE_32BIT = 0x1,
} GPUINDEXTYPE;

// DestBase7 clip-plane codes: 0 when disabled, 0x1000 (bit 12) when enabled.
typedef enum {
    GPUDESTBASE7_CLIPPLANE_DISABLED = 0x0000,
    GPUDESTBASE7_CLIPPLANE_ENABLED = 0x1000,
} GPUDESTBASE7;

//------------------------------------------------------------------------------

// Sample-count record: four counters (Total, ZFail, ZPass, StencilFail),
// each stored as a two-element DWORD array.  Per the member comments each
// pair is an odd/even pair held in little-endian form, so a caller on a
// big-endian CPU presumably has to byte-swap on read (TODO confirm against
// the code that consumes this record).
typedef struct
{
    DWORD Total[2];         // Odd/even pair, little-endian
    DWORD ZFail[2];         // Odd/even pair, little-endian
    DWORD ZPass[2];         // Odd/even pair, little-endian
    DWORD StencilFail[2];   // Odd/even pair, little-endian
} GPU_SAMPLECOUNT;

// Screen-extent record: a min/max bounding range in X, Y and Z, stored as
// six consecutive WORD members.  X and Y are expressed in tiles (one tile
// is 8 pixels along the axis, per the member comments); the Z members hold
// the 11 most significant bits of Z.
typedef struct
{
    WORD MinX;              // In tile coordinates, multiply by 8 for pixel coordinates
    WORD MaxX;              // In tile coordinates, multiply by 8 for pixel coordinates
    WORD MinY;              // In tile coordinates, multiply by 8 for pixel coordinates
    WORD MaxY;              // In tile coordinates, multiply by 8 for pixel coordinates
    WORD MinZ;              // 11 MSBs of Z
    WORD MaxZ;              // 11 MSBs of Z
} GPU_SCREENEXTENT;

//------------------------------------------------------------------------------

// Surface info register image.  The bitfield view and 'dword' alias the same
// storage; field widths total 32 bits.  Bit positions below assume the
// LSB-first bitfield ordering selected at the top of this header.
typedef union {
    struct {
        DWORD SurfacePitch                          : 14;   // bits 0-13
        DWORD                                       : 2;    // bits 14-15, unnamed padding
        DWORD MsaaSamples                           : 2;    // bits 16-17, GPUSAMPLES
        DWORD HiZPitch                              : 14;   // bits 18-31
    };
    DWORD dword;                                            // whole register as one value
} GPU_SURFACEINFO;

// Color info register image.  The bitfield view and 'dword' alias the same
// storage; field widths total 32 bits.  Bit positions below assume the
// LSB-first bitfield ordering selected at the top of this header.
//
// NOTE(review): ColorExpBias is declared with an unsigned type, so it reads
// back as 0..63.  If the bias is meant to be signed, callers must
// sign-extend it themselves - confirm against the hardware documentation.
typedef union {
    struct {
        DWORD ColorBase                             : 12;   // bits 0-11
        DWORD                                       : 4;    // bits 12-15, unnamed padding
        DWORD ColorFormat                           : 4;    // bits 16-19, GPUEDRAMCOLORFORMAT
        DWORD ColorExpBias                          : 6;    // bits 20-25
        DWORD                                       : 6;    // bits 26-31, unnamed padding
    };
    DWORD dword;                                            // whole register as one value
} GPU_COLORINFO;

// Depth info register image.  The bitfield view and 'dword' alias the same
// storage; field widths total 32 bits.  Bit positions below assume the
// LSB-first bitfield ordering selected at the top of this header.
typedef union {
    struct {
        DWORD DepthBase                             : 12;   // bits 0-11
        DWORD                                       : 4;    // bits 12-15, unnamed padding
        DWORD DepthFormat                           : 1;    // bit 16, GPUEDRAMDEPTHFORMAT
        DWORD DisableHZClamp                        : 1;    // bit 17
        DWORD                                       : 14;   // bits 18-31, unnamed padding
    };
    DWORD dword;                                            // whole register as one value
} GPU_DEPTHINFO;

// Packed 2D point.  X and Y are declared 'int', so each is a signed 15-bit
// bitfield, and each is followed by one unnamed padding bit.  The bitfield
// view and 'dword' alias the same 32 bits of storage.  Bit positions below
// assume the LSB-first bitfield ordering selected at the top of this header.
typedef union {
    struct {
        int X                                       : 15;   // bits 0-14, signed
        DWORD                                       : 1;    // bit 15, unnamed padding
        int Y                                       : 15;   // bits 16-30, signed
        DWORD                                       : 1;    // bit 31, unnamed padding
    };
    DWORD dword;                                            // both coordinates as one value
} GPU_POINT;

// Color write mask register image: four 4-bit fields, Write0..Write3, in the
// low 16 bits, with the upper 16 bits unnamed.  The index presumably selects
// the render target (TODO confirm).  Bit positions below assume the
// LSB-first bitfield ordering selected at the top of this header.
typedef union {
    struct {
        DWORD Write0                                : 4;    // bits 0-3
        DWORD Write1                                : 4;    // bits 4-7
        DWORD Write2                                : 4;    // bits 8-11
        DWORD Write3                                : 4;    // bits 12-15
        DWORD                                       : 16;   // bits 16-31, unnamed padding
    };
    DWORD dword;                                            // whole register as one value
} GPU_COLORMASK;

// Stencil reference/mask register image: three 8-bit fields followed by
// eight unnamed bits.  The bitfield view and 'dword' alias the same storage.
// Bit positions below assume the LSB-first bitfield ordering selected at the
// top of this header.
typedef union {
    struct {
        DWORD Ref                                   : 8;    // bits 0-7
        DWORD Mask                                  : 8;    // bits 8-15
        DWORD WriteMask                             : 8;    // bits 16-23
        DWORD                                       : 8;    // bits 24-31, unnamed padding
    };
    DWORD dword;                                            // whole register as one value
} GPU_STENCILREFMASK;

// Program control register image, mixing vertex-shader (Vs*) and
// pixel-shader (Ps*) settings in one 32-bit value.  The
// GPU_PROGRAMCONTROL_PS_MASK / GPU_PROGRAMCONTROL_VS_MASK macros that follow
// this type split the bits between the two.  Bit positions below assume the
// LSB-first bitfield ordering selected at the top of this header.
typedef union {
    struct {
        DWORD VsMaxReg                              : 6;    // bits 0-5
        DWORD                                       : 2;    // bits 6-7, unnamed padding
        DWORD PsMaxReg                              : 6;    // bits 8-13
        DWORD                                       : 2;    // bits 14-15, unnamed padding
        DWORD VsResource                            : 1;    // bit 16
        DWORD PsResource                            : 1;    // bit 17
        DWORD ParamGen                              : 1;    // bit 18
        DWORD GenIndexPix                           : 1;    // bit 19
        DWORD VsExportCount                         : 4;    // bits 20-23
        DWORD VsExportMode                          : 3;    // bits 24-26, GPUVSEXPORTMODE
        DWORD PsExportZ                             : 1;    // bit 27
        DWORD PsExportColorCount                    : 3;    // bits 28-30
        DWORD GenIndexVtx                           : 1;    // bit 31
    };
    DWORD dword;                                            // whole register as one value
} GPU_PROGRAMCONTROL;

// Bits of GPU_PROGRAMCONTROL selected by the PS mask, grouped by field:
//   0x00003F00  PsMaxReg                             (bits 8-13)
//   0x000E0000  PsResource | ParamGen | GenIndexPix  (bits 17-19)
//   0x78000000  PsExportZ | PsExportColorCount       (bits 27-30)
// The VS mask is simply every remaining bit.
#define GPU_PROGRAMCONTROL_PS_MASK (0x00003F00 | 0x000E0000 | 0x78000000)
#define GPU_PROGRAMCONTROL_VS_MASK (~GPU_PROGRAMCONTROL_PS_MASK)

// Context misc register image.  The bitfield view and 'dword' alias the same
// storage; field widths total 32 bits.  Bit positions below assume the
// LSB-first bitfield ordering selected at the top of this header.
typedef union {
    struct {
        DWORD InstPredOptimize                      : 1;    // bit 0
        DWORD OutputScreenXY                        : 1;    // bit 1
        DWORD SampleControl                         : 2;    // bits 2-3, GPUSAMPLECONTROL
        DWORD                                       : 4;    // bits 4-7, unnamed padding
        DWORD ParamGenPos                           : 8;    // bits 8-15
        DWORD PerfCounterRef                        : 1;    // bit 16
        DWORD YieldOptimize                         : 1;    // bit 17
        DWORD TxCacheSelect                         : 1;    // bit 18
        DWORD                                       : 13;   // bits 19-31, unnamed padding
    };
    DWORD dword;                                            // whole register as one value
} GPU_CONTEXTMISC;

// Bits of GPU_CONTEXTMISC selected by the PS mask: bits 1-15, i.e.
// OutputScreenXY, SampleControl, the unnamed bits 4-7 and the whole of
// ParamGenPos (bits 8-15).  The VS mask is every remaining bit.
//
// The previous literal was written with seven hex digits (0x0000FFE), which
// evaluates to 0xFFE and so covered only the low four bits of the 8-bit
// ParamGenPos field, leaving its upper four bits in the VS mask.  The value
// below keeps every bit the old mask had and adds the missing upper nibble.
// NOTE(review): confirm the intended mask against the hardware documentation.
#define GPU_CONTEXTMISC_PS_MASK 0x0000FFFE
#define GPU_CONTEXTMISC_VS_MASK (~GPU_CONTEXTMISC_PS_MASK)

// SQ_INTERPOLATOR_CNTL register word (member InterpolatorControl of
// GPU_PROGRAMPACKET): two 16-bit halves. Bit 0 is the least-significant bit;
// `dword` exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD ParamShade                            : 16;   // bits 0-15
        DWORD SamplingPattern                       : 16;   // bits 16-31
    };
    DWORD dword;
} GPU_INTERPOLATORCONTROL;

// Eight 4-bit wrap fields, Wrap0..Wrap7, packed into one DWORD. Used for the
// Wrapping1 member (SQ_WRAPPING_1) of GPU_PROGRAMPACKET.
// NOTE(review): the type suffixed "1" carries Wrap0-7 while GPU_WRAPPING0
// carries Wrap8-15 - confirm this numbering is intentional.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD Wrap0                                 : 4;    // bits 0-3
        DWORD Wrap1                                 : 4;    // bits 4-7
        DWORD Wrap2                                 : 4;    // bits 8-11
        DWORD Wrap3                                 : 4;    // bits 12-15
        DWORD Wrap4                                 : 4;    // bits 16-19
        DWORD Wrap5                                 : 4;    // bits 20-23
        DWORD Wrap6                                 : 4;    // bits 24-27
        DWORD Wrap7                                 : 4;    // bits 28-31
    };
    DWORD dword;
} GPU_WRAPPING1;

// Eight 4-bit wrap fields, Wrap8..Wrap15, packed into one DWORD. Used for the
// Wrapping0 member (SQ_WRAPPING_0) of GPU_PROGRAMPACKET.
// NOTE(review): the type suffixed "0" carries Wrap8-15 while GPU_WRAPPING1
// carries Wrap0-7 - confirm this numbering is intentional.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD Wrap8                                 : 4;    // bits 0-3
        DWORD Wrap9                                 : 4;    // bits 4-7
        DWORD Wrap10                                : 4;    // bits 8-11
        DWORD Wrap11                                : 4;    // bits 12-15
        DWORD Wrap12                                : 4;    // bits 16-19
        DWORD Wrap13                                : 4;    // bits 20-23
        DWORD Wrap14                                : 4;    // bits 24-27
        DWORD Wrap15                                : 4;    // bits 28-31
    };
    DWORD dword;
} GPU_WRAPPING0;

// RB_DEPTHCONTROL register word (member DepthControl of GPU_CONTROLPACKET).
// The four stencil fields at bits 8-19 are repeated with a "BF" suffix at
// bits 20-31. Bit 0 is the least-significant bit; `dword` exposes the same
// 32 bits as one value.
typedef union {
    struct {
        DWORD StencilEnable                         : 1;    // bit 0
        DWORD ZEnable                               : 1;    // bit 1
        DWORD ZWriteEnable                          : 1;    // bit 2
        DWORD                                       : 1;    // bit 3 (padding)
        DWORD ZFunc                                 : 3;    // bits 4-6, GPUCMPFUNC
        DWORD BackFaceEnable                        : 1;    // bit 7
        DWORD StencilFunc                           : 3;    // bits 8-10, GPUCMPFUNC
        DWORD StencilFail                           : 3;    // bits 11-13, GPUSTENCILOP
        DWORD StencilZPass                          : 3;    // bits 14-16, GPUSTENCILOP
        DWORD StencilZFail                          : 3;    // bits 17-19, GPUSTENCILOP
        DWORD StencilFuncBF                         : 3;    // bits 20-22, GPUCMPFUNC
        DWORD StencilFailBF                         : 3;    // bits 23-25, GPUSTENCILOP
        DWORD StencilZPassBF                        : 3;    // bits 26-28, GPUSTENCILOP
        DWORD StencilZFailBF                        : 3;    // bits 29-31, GPUSTENCILOP
    };
    DWORD dword;
} GPU_DEPTHCONTROL;

// Blend-control register word. GPU_CONTROLPACKET holds four of these
// (BlendControl0..3, RB_BLENDCONTROL0..3). The color settings sit in the low
// 16 bits and the alpha settings use the same layout in the high 16 bits.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD ColorSrcBlend                         : 5;    // bits 0-4, GPUBLEND
        DWORD ColorBlendOp                          : 3;    // bits 5-7, GPUBLENDOP
        DWORD ColorDestBlend                        : 5;    // bits 8-12, GPUBLEND
        DWORD                                       : 3;    // bits 13-15 (padding)
        DWORD AlphaSrcBlend                         : 5;    // bits 16-20, GPUBLEND
        DWORD AlphaBlendOp                          : 3;    // bits 21-23, GPUBLENDOP
        DWORD AlphaDestBlend                        : 5;    // bits 24-28, GPUBLEND
        DWORD                                       : 3;    // bits 29-31 (padding)
    };
    DWORD dword;
} GPU_BLENDCONTROL;

// RB_COLORCONTROL register word (member ColorControl of GPU_CONTROLPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD AlphaFunc                             : 3;    // bits 0-2, GPUCMPFUNC
        DWORD AlphaTestEnable                       : 1;    // bit 3
        DWORD AlphaToMaskEnable                     : 1;    // bit 4
        DWORD                                       : 19;   // bits 5-23 (padding)
        DWORD AlphaToMaskOffset0                    : 2;    // bits 24-25
        DWORD AlphaToMaskOffset1                    : 2;    // bits 26-27
        DWORD AlphaToMaskOffset2                    : 2;    // bits 28-29
        DWORD AlphaToMaskOffset3                    : 2;    // bits 30-31
    };
    DWORD dword;
} GPU_COLORCONTROL;

// Hierarchical-Z / hierarchical-stencil control word (member HiControl of
// GPU_CONTROLPACKET, annotated there as RB_TILECONTROL). Bit 0 is the
// least-significant bit; `dword` exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD HiZWriteEnable                        : 1;    // bit 0
        DWORD HiZEnable                             : 1;    // bit 1
        DWORD HiStencilWriteEnable                  : 1;    // bit 2
        DWORD HiStencilEnable                       : 1;    // bit 3
        DWORD HiZFunc                               : 1;    // bit 4, GPUHIZFUNC
        DWORD HiStencilFunc                         : 1;    // bit 5, GPUHISTENCILFUNC
        DWORD                                       : 2;    // bits 6-7 (padding)
        DWORD HiStencilRef                          : 8;    // bits 8-15
        DWORD                                       : 1;    // bit 16 (padding)
        DWORD HiBaseAddr                            : 15;   // bits 17-31
    };
    DWORD dword;
} GPU_HICONTROL;

// PA_CL_CLIP_CNTL register word (member ClipControl of GPU_CONTROLPACKET).
// Six per-plane enable bits occupy the low bits; the mode and miscellaneous
// flags start at bit 14. Bit 0 is the least-significant bit; `dword` exposes
// the same 32 bits as one value.
typedef union {
    struct {
        DWORD ClipPlaneEnable0                      : 1;    // bit 0
        DWORD ClipPlaneEnable1                      : 1;    // bit 1
        DWORD ClipPlaneEnable2                      : 1;    // bit 2
        DWORD ClipPlaneEnable3                      : 1;    // bit 3
        DWORD ClipPlaneEnable4                      : 1;    // bit 4
        DWORD ClipPlaneEnable5                      : 1;    // bit 5
        DWORD                                       : 8;    // bits 6-13 (padding)
        DWORD ClipPlaneMode                         : 2;    // bits 14-15, GPUCLIPPLANEMODE
        DWORD ClipDisable                           : 1;    // bit 16
        DWORD ClipPlaneCullOnlyEnable               : 1;    // bit 17
        DWORD BoundaryEdgeFlagEnable                : 1;    // bit 18
        DWORD DxClipSpaceDef                        : 1;    // bit 19
        DWORD DisableClipErrDetect                  : 1;    // bit 20
        DWORD VtxKillOr                             : 1;    // bit 21
        DWORD XyNanRetain                           : 1;    // bit 22
        DWORD ZNanRetain                            : 1;    // bit 23
        DWORD WNanRetain                            : 1;    // bit 24
        DWORD                                       : 7;    // bits 25-31 (padding)
    };
    DWORD dword;
} GPU_CLIPCONTROL;

// PA_SU_SC_MODE_CNTL register word (member ModeControl of GPU_CONTROLPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD CullMode                              : 3;    // bits 0-2, GPUCULL
        // NOTE(review): PolyMode is annotated BOOL yet is two bits wide -
        // confirm which values are valid.
        DWORD PolyMode                              : 2;    // bits 3-4, BOOL
        DWORD PolyModeFrontPType                    : 3;    // bits 5-7, GPUFILLMODE
        DWORD PolyModeBackPType                     : 3;    // bits 8-10, GPUFILLMODE
        DWORD PolyOffsetFrontEnable                 : 1;    // bit 11
        DWORD PolyOffsetBackEnable                  : 1;    // bit 12
        DWORD PolyOffsetParaEnable                  : 1;    // bit 13
        DWORD                                       : 1;    // bit 14 (padding)
        DWORD MsaaEnable                            : 1;    // bit 15
        DWORD VtxWindowOffsetEnable                 : 1;    // bit 16
        DWORD                                       : 2;    // bits 17-18 (padding)
        DWORD ProvokingVtxLast                      : 1;    // bit 19
        DWORD PerspCorrDisable                      : 1;    // bit 20
        DWORD MultiPrimIbEnable                     : 1;    // bit 21
        DWORD                                       : 1;    // bit 22 (padding)
        DWORD QuadOrderEnable                       : 1;    // bit 23
        DWORD ScOneQuadPerClock                     : 1;    // bit 24
        DWORD                                       : 7;    // bits 25-31 (padding)
    };
    DWORD dword;
} GPU_MODECONTROL;

// PA_CL_VTE_CNTL register word (member VteControl of GPU_CONTROLPACKET):
// separate scale and offset enable bits for the X, Y and Z viewport axes,
// followed by vertex-format flags. Bit 0 is the least-significant bit;
// `dword` exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD VportXScaleEnable                     : 1;    // bit 0
        DWORD VportXOffsetEnable                    : 1;    // bit 1
        DWORD VportYScaleEnable                     : 1;    // bit 2
        DWORD VportYOffsetEnable                    : 1;    // bit 3
        DWORD VportZScaleEnable                     : 1;    // bit 4
        DWORD VportZOffsetEnable                    : 1;    // bit 5
        DWORD                                       : 2;    // bits 6-7 (padding)
        DWORD VtxXyFmt                              : 1;    // bit 8
        DWORD VtxZFmt                               : 1;    // bit 9
        DWORD VtxW0Fmt                              : 1;    // bit 10
        DWORD PerfCounterRef                        : 1;    // bit 11
        DWORD                                       : 20;   // bits 12-31 (padding)
    };
    DWORD dword;
} GPU_VTECONTROL;

// RB_MODECONTROL register word (member EdramModeControl of
// GPU_CONTROLPACKET). Bit 0 is the least-significant bit; `dword` exposes the
// same 32 bits as one value.
typedef union {
    struct {
        DWORD EdramMode                             : 3;    // bits 0-2, GPUEDRAMMODE
        DWORD ColorDepthMacro                       : 1;    // bit 3
        DWORD                                       : 28;   // bits 4-31 (padding)
    };
    DWORD dword;
} GPU_EDRAMMODECONTROL;

// PA_SU_POINT_SIZE register word (member PointSize of GPU_TESSELLATORPACKET).
// Height is in the low half and Width in the high half. Bit 0 is the
// least-significant bit; `dword` exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD Height                                : 16;   // bits 0-15
        DWORD Width                                 : 16;   // bits 16-31
    };
    DWORD dword;
} GPU_POINTSIZE;

// PA_SU_POINT_MINMAX register word (member PointMinMax of
// GPU_TESSELLATORPACKET). Bit 0 is the least-significant bit; `dword` exposes
// the same 32 bits as one value.
typedef union {
    struct {
        DWORD MinSize                               : 16;   // bits 0-15
        DWORD MaxSize                               : 16;   // bits 16-31
    };
    DWORD dword;
} GPU_POINTMINMAX;

// PA_SU_LINE_CNTL register word (member LineControl of
// GPU_TESSELLATORPACKET). Bit 0 is the least-significant bit; `dword` exposes
// the same 32 bits as one value.
typedef union {
    struct {
        DWORD Width                                 : 16;   // bits 0-15
        DWORD                                       : 16;   // bits 16-31 (padding)
    };
    DWORD dword;
} GPU_LINECONTROL;

// VGT_OUTPUT_PATH_CNTL register word (member OutputPathControl of
// GPU_TESSELLATORPACKET). Bit 0 is the least-significant bit; `dword` exposes
// the same 32 bits as one value.
typedef union {
    struct {
        DWORD PathSelect                            : 2;    // bits 0-1, GPUPATHSELECT
        DWORD                                       : 30;   // bits 2-31 (padding)
    };
    DWORD dword;
} GPU_OUTPUTPATHCONTROL;

// VGT_HOS_CNTL register word (member HosControl of GPU_TESSELLATORPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD TessMode                              : 2;    // bits 0-1, GPUTESSMODE
        DWORD                                       : 30;   // bits 2-31 (padding)
    };
    DWORD dword;
} GPU_HOSCONTROL;

// VGT_GROUP_PRIM_TYPE register word (member GroupPrimType of
// GPU_TESSELLATORPACKET). Bit 0 is the least-significant bit; `dword` exposes
// the same 32 bits as one value.
typedef union {
    struct {
        DWORD GroupPrimType                         : 4;    // bits 0-3, GPUGROUPPRIMTYPE
        DWORD                                       : 10;   // bits 4-13 (padding)
        DWORD GroupRetainOrder                      : 1;    // bit 14
        DWORD GroupRetainQuads                      : 1;    // bit 15
        DWORD GroupPrimOrder                        : 3;    // bits 16-18, GPUGROUPPRIMORDER
        DWORD                                       : 13;   // bits 19-31 (padding)
    };
    DWORD dword;
} GPU_GROUPPRIMTYPE;

// Group-vector control register word. GPU_TESSELLATORPACKET holds two of
// these: GroupVect0Control (VGT_GROUP_VECT_0_CNTL) and GroupVect1Control
// (VGT_GROUP_VECT_1_CNTL). Bit 0 is the least-significant bit; `dword`
// exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD CompXEnable                           : 1;    // bit 0
        DWORD CompYEnable                           : 1;    // bit 1
        DWORD CompZEnable                           : 1;    // bit 2
        DWORD CompWEnable                           : 1;    // bit 3
        DWORD                                       : 4;    // bits 4-7 (padding)
        DWORD Stride                                : 8;    // bits 8-15
        DWORD Shift                                 : 8;    // bits 16-23
        DWORD                                       : 8;    // bits 24-31 (padding)
    };
    DWORD dword;
} GPU_GROUPVECTCONTROL;

// Group-vector format register word. GPU_TESSELLATORPACKET holds two of
// these: GroupVect0FmtControl (VGT_GROUP_VECT_0_FMT_CNTL) and
// GroupVect1FmtControl (VGT_GROUP_VECT_1_FMT_CNTL). Each of X, Y, Z, W gets
// one byte: a 4-bit conversion selector followed by a 4-bit offset.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD XConv                                 : 4;    // bits 0-3, GPUGROUPCONV
        DWORD XOffset                               : 4;    // bits 4-7
        DWORD YConv                                 : 4;    // bits 8-11, GPUGROUPCONV
        DWORD YOffset                               : 4;    // bits 12-15
        DWORD ZConv                                 : 4;    // bits 16-19, GPUGROUPCONV
        DWORD ZOffset                               : 4;    // bits 20-23
        DWORD WConv                                 : 4;    // bits 24-27, GPUGROUPCONV
        DWORD WOffset                               : 4;    // bits 28-31
    };
    DWORD dword;
} GPU_GROUPVECTFMTCONTROL;

// PA_SC_MPASS_PS_CNTL register word (member MPassPsControl of
// GPU_TESSELLATORPACKET). The enable flag is the top bit. Bit 0 is the
// least-significant bit; `dword` exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD MPassPixVecPerPass                    : 20;   // bits 0-19
        DWORD                                       : 11;   // bits 20-30 (padding)
        DWORD MPassPsEnable                         : 1;    // bit 31
    };
    DWORD dword;
} GPU_MPASSPSCONTROL;

// PA_SC_VIZ_QUERY register word (member VizQuery of GPU_TESSELLATORPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD VizQueryEnable                        : 1;    // bit 0
        DWORD VizQueryId                            : 6;    // bits 1-6
        DWORD KillPixPostHiZ                        : 1;    // bit 7
        DWORD KillPixPostDetailMask                 : 1;    // bit 8
        DWORD                                       : 23;   // bits 9-31 (padding)
    };
    DWORD dword;
} GPU_VIZQUERY;

// VGT_ENHANCE register word (member Enhance of GPU_TESSELLATORPACKET). Only
// an opaque 16-bit Misc field is defined. Bit 0 is the least-significant bit;
// `dword` exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD Misc                                  : 16;   // bits 0-15
        DWORD                                       : 16;   // bits 16-31 (padding)
    };
    DWORD dword;
} GPU_ENHANCE;

// PA_SC_LINE_CNTL register word (member ScLineControl of GPU_MISCPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD BresControl                           : 8;    // bits 0-7
        DWORD UseBresControl                        : 1;    // bit 8
        DWORD ExpandLineWidth                       : 1;    // bit 9
        DWORD LastPixel                             : 1;    // bit 10
        DWORD                                       : 21;   // bits 11-31 (padding)
    };
    DWORD dword;
} GPU_SCLINECONTROL;

// PA_SC_AA_CONFIG register word (member AaConfig of GPU_MISCPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD MsaaNumSamples                        : 3;    // bits 0-2
        DWORD                                       : 10;   // bits 3-12 (padding)
        DWORD MaxSampleDist                         : 4;    // bits 13-16
        DWORD                                       : 15;   // bits 17-31 (padding)
    };
    DWORD dword;
} GPU_AACONFIG;

// PA_SU_VTX_CNTL register word (member VtxControl of GPU_MISCPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD PixCenter                             : 1;    // bit 0, GPUPIXCENTER
        DWORD RoundMode                             : 2;    // bits 1-2, GPUROUNDMODE
        DWORD QuantMode                             : 3;    // bits 3-5, GPUQUANTMODE
        DWORD                                       : 26;   // bits 6-31 (padding)
    };
    DWORD dword;
} GPU_VTXCONTROL;

// Shader-constant range register word. GPU_MISCPACKET uses it for both
// VsConst (SQ_VS_CONST) and PsConst (SQ_PS_CONST). Holds a 9-bit Base and a
// 9-bit Size. Bit 0 is the least-significant bit; `dword` exposes the same
// 32 bits as one value.
typedef union {
    struct {
        DWORD Base                                  : 9;    // bits 0-8
        DWORD                                       : 3;    // bits 9-11 (padding)
        DWORD Size                                  : 9;    // bits 12-20
        DWORD                                       : 11;   // bits 21-31 (padding)
    };
    DWORD dword;
} GPU_CONST;

// SQ_DEBUG_MISC_0 register word (member DebugMisc0 of GPU_MISCPACKET).
// NOTE(review): the sibling fields use the "DbProb" prefix, so "DbProgOn" may
// be a misspelling; it is part of the public interface, so confirm with
// callers before renaming.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD DbProgOn                              : 1;    // bit 0
        DWORD                                       : 3;    // bits 1-3 (padding)
        DWORD DbProbBreak                           : 1;    // bit 4
        DWORD                                       : 3;    // bits 5-7 (padding)
        DWORD DbProbAddr                            : 11;   // bits 8-18
        DWORD                                       : 5;    // bits 19-23 (padding)
        DWORD DbProbCount                           : 8;    // bits 24-31
    };
    DWORD dword;
} GPU_DEBUGMISC0;

// SQ_DEBUG_MISC_1 register word (member DebugMisc1 of GPU_MISCPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD DbOnPix                               : 1;    // bit 0
        DWORD DbOnVtx                               : 1;    // bit 1
        DWORD                                       : 6;    // bits 2-7 (padding)
        DWORD DbInstCount                           : 8;    // bits 8-15
        DWORD DbBreakAddr                           : 11;   // bits 16-26
        DWORD                                       : 5;    // bits 27-31 (padding)
    };
    DWORD dword;
} GPU_DEBUGMISC1;

// VGT_VERTEX_REUSE_BLOCK_CNTL register word (member VertexReuseBlockControl
// of GPU_MISCPACKET). Bit 0 is the least-significant bit; `dword` exposes the
// same 32 bits as one value.
typedef union {
    struct {
        DWORD VtxReuseDepth                         : 8;    // bits 0-7
        DWORD                                       : 24;   // bits 8-31 (padding)
    };
    DWORD dword;
} GPU_VERTEXREUSEBLOCKCONTROL;

// VGT_OUT_DEALLOC_CNTL register word (member OutDeallocControl of
// GPU_MISCPACKET). Bit 0 is the least-significant bit; `dword` exposes the
// same 32 bits as one value.
typedef union {
    struct {
        DWORD DeallocDist                           : 7;    // bits 0-6
        DWORD                                       : 25;   // bits 7-31 (padding)
    };
    DWORD dword;
} GPU_OUTDEALLOCCONTROL;

// RB_COPY_CONTROL register word (member CopyControl of GPU_MISCPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD CopySrcSelect                         : 3;    // bits 0-2, GPUCOPYSRCSELECT
        DWORD                                       : 1;    // bit 3 (padding)
        DWORD CopySampleSelect                      : 3;    // bits 4-6, GPUCOPYSAMPLESELECT
        DWORD                                       : 1;    // bit 7 (padding)
        DWORD ColorClearEnable                      : 1;    // bit 8
        DWORD DepthClearEnable                      : 1;    // bit 9
        DWORD                                       : 10;   // bits 10-19 (padding)
        DWORD CopyCommand                           : 2;    // bits 20-21, GPUCOPYCOMMAND
        DWORD                                       : 10;   // bits 22-31 (padding)
    };
    DWORD dword;
} GPU_COPYCONTROL;

// RB_COPY_DEST_PITCH register word (member CopyDestPitch of GPU_MISCPACKET):
// a 14-bit pitch in the low half and a 14-bit height in the high half.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD CopyDestPitch                         : 14;   // bits 0-13
        DWORD                                       : 2;    // bits 14-15 (padding)
        DWORD CopyDestHeight                        : 14;   // bits 16-29
        DWORD                                       : 2;    // bits 30-31 (padding)
    };
    DWORD dword;
} GPU_COPYDESTPITCH;

// RB_COPY_DEST_INFO register word (member CopyDestInfo of GPU_MISCPACKET).
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD CopyDestEndian                        : 3;    // bits 0-2, GPUENDIAN128
        DWORD CopyDestArray                         : 1;    // bit 3, GPUCOLORARRAY
        DWORD CopyDestSlice                         : 3;    // bits 4-6
        DWORD CopyDestFormat                        : 6;    // bits 7-12, GPUCOLORFORMAT
        DWORD CopyDestNumber                        : 3;    // bits 13-15, GPUSURFACENUMBER
        DWORD CopyDestExpBias                       : 6;    // bits 16-21
        DWORD                                       : 2;    // bits 22-23 (padding)
        DWORD CopyDestSwap                          : 1;    // bit 24, GPUSURFACESWAP
        DWORD                                       : 7;    // bits 25-31 (padding)
    };
    DWORD dword;
} GPU_COPYDESTINFO;

// RB_COPY_FUNC register word (member CopyFunc of GPU_MISCPACKET): one 3-bit
// selector per color channel, each starting on a 4-bit boundary.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD CopyFuncRed                           : 3;    // bits 0-2
        DWORD                                       : 1;    // bit 3 (padding)
        DWORD CopyFuncGreen                         : 3;    // bits 4-6
        DWORD                                       : 1;    // bit 7 (padding)
        DWORD CopyFuncBlue                          : 3;    // bits 8-10
        DWORD                                       : 1;    // bit 11 (padding)
        DWORD CopyFuncAlpha                         : 3;    // bits 12-14
        DWORD                                       : 17;   // bits 15-31 (padding)
    };
    DWORD dword;
} GPU_COPYFUNC;

// RB_COPY_REF register word (member CopyRef of GPU_MISCPACKET): one byte per
// channel, red in the lowest byte and alpha in the highest.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD CopyRefRed                            : 8;    // bits 0-7
        DWORD CopyRefGreen                          : 8;    // bits 8-15
        DWORD CopyRefBlue                           : 8;    // bits 16-23
        DWORD CopyRefAlpha                          : 8;    // bits 24-31
    };
    DWORD dword;
} GPU_COPYREF;

// RB_COPY_MASK register word (member CopyMask of GPU_MISCPACKET): one byte
// per channel, red in the lowest byte and alpha in the highest.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD CopyMaskRed                           : 8;    // bits 0-7
        DWORD CopyMaskGreen                         : 8;    // bits 8-15
        DWORD CopyMaskBlue                          : 8;    // bits 16-23
        DWORD CopyMaskAlpha                         : 8;    // bits 24-31
    };
    DWORD dword;
} GPU_COPYMASK;

// RB_SAMPLE_COUNT_CTL register word (member SampleCountControl of
// GPU_MISCPACKET). Bit 0 is the least-significant bit; `dword` exposes the
// same 32 bits as one value.
typedef union {
    struct {
        DWORD ResetSampleCount                      : 1;    // bit 0
        DWORD CopySampleCount                       : 1;    // bit 1
        DWORD                                       : 30;   // bits 2-31 (padding)
    };
    DWORD dword;
} GPU_SAMPLECOUNTCONTROL;

// Register word addressed by GPUREG_BCCONTROL. Bit 0 is the least-significant
// bit; `dword` exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD BankActToActSClk                      : 6;    // bits 0-5
        DWORD                                       : 2;    // bits 6-7 (padding)
        DWORD DisableFragCombine                    : 1;    // bit 8
        DWORD DisableReOrder                        : 1;    // bit 9
        DWORD HzFudgeShift                          : 2;    // bits 10-11
        DWORD ScreenExtMethod                       : 2;    // bits 12-13, GPUSCREENEXTENTS
        DWORD                                       : 18;   // bits 14-31 (padding)
    };
    DWORD dword;
} GPU_BCCONTROL;

// Register word addressed by GPUREG_COHERSTATUSHOST. Contains one enable bit
// for the copy destination base and one for each of destination bases 0-7,
// three action-enable bits, and a Status flag in the top bit.
// Bit 0 is the least-significant bit; `dword` exposes the same 32 bits as one
// value.
typedef union {
    struct {
        DWORD MatchingContexts                      : 8;    // bits 0-7
        DWORD CopyDestBaseEnable                    : 1;    // bit 8
        DWORD DestBase0Enable                       : 1;    // bit 9
        DWORD DestBase1Enable                       : 1;    // bit 10
        DWORD DestBase2Enable                       : 1;    // bit 11
        DWORD DestBase3Enable                       : 1;    // bit 12
        DWORD DestBase4Enable                       : 1;    // bit 13
        DWORD DestBase5Enable                       : 1;    // bit 14
        DWORD DestBase6Enable                       : 1;    // bit 15
        DWORD DestBase7Enable                       : 1;    // bit 16
        DWORD                                       : 7;    // bits 17-23 (padding)
        DWORD VcActionEnable                        : 1;    // bit 24
        DWORD TcActionEnable                        : 1;    // bit 25
        DWORD PglbActionEnable                      : 1;    // bit 26
        DWORD                                       : 4;    // bits 27-30 (padding)
        DWORD Status                                : 1;    // bit 31
    };
    DWORD dword;
} GPU_COHERSTATUS;

// Register word addressed by GPUREG_WAITUNTIL: a set of single-bit wait flags
// plus a 4-bit CmdFifoEntries count. Bit 0 is the least-significant bit;
// `dword` exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD WaitCrtcPFlip                         : 1;    // bit 0
        DWORD WaitReCrtcVLine                       : 1;    // bit 1
        DWORD WaitFeCrtcVLine                       : 1;    // bit 2
        DWORD WaitCrtcVLine                         : 1;    // bit 3
        DWORD                                       : 4;    // bits 4-7 (padding)
        DWORD WaitCpDmaIdle                         : 1;    // bit 8
        DWORD                                       : 1;    // bit 9 (padding)
        DWORD WaitCmdFifo                           : 1;    // bit 10
        DWORD WaitOvFlip                            : 1;    // bit 11
        DWORD                                       : 3;    // bits 12-14 (padding)
        DWORD WaitIdle                              : 1;    // bit 15
        DWORD                                       : 1;    // bit 16 (padding)
        DWORD WaitIdleClean                         : 1;    // bit 17
        DWORD                                       : 2;    // bits 18-19 (padding)
        DWORD CmdFifoEntries                        : 4;    // bits 20-23
        DWORD                                       : 8;    // bits 24-31 (padding)
    };
    DWORD dword;
} GPU_WAITUNTIL;

// Register word addressed by GPUREG_CPUINTERRUPTACK: one acknowledge bit for
// each of six CPU indices (0-5). Bit 0 is the least-significant bit; `dword`
// exposes the same 32 bits as one value.
typedef union {
    struct {
        DWORD Cpu0Ack                               : 1;    // bit 0
        DWORD Cpu1Ack                               : 1;    // bit 1
        DWORD Cpu2Ack                               : 1;    // bit 2
        DWORD Cpu3Ack                               : 1;    // bit 3
        DWORD Cpu4Ack                               : 1;    // bit 4
        DWORD Cpu5Ack                               : 1;    // bit 5
        DWORD                                       : 26;   // bits 6-31 (padding)
    };
    DWORD dword;
} GPU_CPUINTERRUPTACK;

// Register word addressed by GPUREG_GPRMANAGEMENT: a Dynamic flag plus 7-bit
// pixel and vertex sizes. Bit 0 is the least-significant bit; `dword` exposes
// the same 32 bits as one value.
typedef union {
    struct {
        DWORD Dynamic                               : 1;    // bit 0
        DWORD                                       : 3;    // bits 1-3 (padding)
        DWORD PixelSize                             : 7;    // bits 4-10
        DWORD                                       : 1;    // bit 11 (padding)
        DWORD VertexSize                            : 7;    // bits 12-18
        DWORD                                       : 13;   // bits 19-31 (padding)
    };
    DWORD dword;
} GPU_GPRMANAGEMENT;

// Register word addressed by GPUREG_INSTSTOREMANAGEMENT; also embedded as
// DWORD 11 of GPU_MEINIT. PixelBase is in the low half and VertexBase in the
// high half. Bit 0 is the least-significant bit; `dword` exposes the same
// 32 bits as one value.
typedef union {
    struct {
        DWORD PixelBase                             : 16;   // bits 0-15
        DWORD VertexBase                            : 16;   // bits 16-31
    };
    DWORD dword;
} GPU_INSTSTOREMANAGEMENT;

// 18-DWORD initialization block. The outer union overlays the named members
// on dword[18]; the nested anonymous unions additionally allow
//   - DWORDs 3-10 to be reached either by name or through Reserved3To10[8];
//   - DWORD 11 to be reached either as the PixelBase/VertexBase bitfields or
//     as a GPU_INSTSTOREMANAGEMENT value.
// The trailing comment on each member gives its DWORD index and the value the
// slot is required to hold.
// NOTE(review): the consumer of this block is not visible here - presumably a
// micro-engine init command; confirm against the command-buffer code.
typedef union {
    struct {
        DWORD Flags                                     ; // DWORD 0  must be 0x3FF
        DWORD Reserved1                                 ; // DWORD 1  must be 0
        DWORD Reserved2                                 ; // DWORD 2  must be 0
        union {
            struct {
                // The required values below step by 0x80 from one slot to
                // the next.
                DWORD Reserved3                         ; // DWORD 3  must be 0
                DWORD Reserved4                         ; // DWORD 4  must be 0x80
                DWORD Reserved5                         ; // DWORD 5  must be 0x100
                DWORD Reserved6                         ; // DWORD 6  must be 0x180
                DWORD Reserved7                         ; // DWORD 7  must be 0x200
                DWORD Reserved8                         ; // DWORD 8  must be 0x280
                DWORD Reserved9                         ; // DWORD 9  must be 0x300
                DWORD Reserved10                        ; // DWORD 10 must be 0x380
            };
            DWORD Reserved3To10[8]                      ; // DWORD 3-10
        };
        union {
            struct {
                DWORD PixelBase                     : 16; // DWORD 11, bits 0-15
                DWORD VertexBase                    : 16; // DWORD 11, bits 16-31
            };
            GPU_INSTSTOREMANAGEMENT InstructionStore    ; // DWORD 11, same bits as a register value
        };
        DWORD MaxContext                                ; // DWORD 12 must be 7
        DWORD Reserved13                                ; // DWORD 13 must be 0
        DWORD Reserved14                                ; // DWORD 14 must be 0
        DWORD Reserved15                                ; // DWORD 15 must be 0
        DWORD Reserved16                                ; // DWORD 16 must be 0
        DWORD Reserved17                                ; // DWORD 17 must be 0
    };
    DWORD dword[18];                                      // raw view of all 18 DWORDs
} GPU_MEINIT;

//------------------------------------------------------------------------------

// Packet 0:

// Packet 0 register block; its first register is GPUREG_DESTINATIONPACKET
// (0x2000). Members appear in register order and each trailing comment names
// the hardware register the member maps to. The blank line separates groups
// of eight members.
typedef struct
{
    GPU_SURFACEINFO             SurfaceInfo;            // RB_SURFACE_INFO
    GPU_COLORINFO               Color0Info;             // RB_COLOR0_INFO
    GPU_DEPTHINFO               DepthInfo;              // RB_DEPTH_INFO
    GPU_COLORINFO               Color1Info;             // RB_COLOR1_INFO
    GPU_COLORINFO               Color2Info;             // RB_COLOR2_INFO
    GPU_COLORINFO               Color3Info;             // RB_COLOR3_INFO
    DWORD                       CoherDestBase0;         // COHER_DEST_BASE_0
    DWORD                       CoherDestBase1;         // COHER_DEST_BASE_1

    DWORD                       CoherDestBase2;         // COHER_DEST_BASE_2
    DWORD                       CoherDestBase3;         // COHER_DEST_BASE_3
    DWORD                       CoherDestBase4;         // COHER_DEST_BASE_4
    DWORD                       CoherDestBase5;         // COHER_DEST_BASE_5
    DWORD                       CoherDestBase6;         // COHER_DEST_BASE_6
    DWORD                       CoherDestBase7;         // COHER_DEST_BASE_7
    GPU_POINT                   ScreenScissorTL;        // PA_SC_SCREEN_SCISSOR_TL
    GPU_POINT                   ScreenScissorBR;        // PA_SC_SCREEN_SCISSOR_BR
} GPU_DESTINATIONPACKET;

// Packet 1:

// Packet 1 register block: the window offset followed by the top-left and
// bottom-right corners of the window scissor. Each trailing comment names the
// hardware register the member maps to.
typedef struct
{
    GPU_POINT                   WindowOffset;           // PA_SC_WINDOW_OFFSET
    GPU_POINT                   WindowScissorTL;        // PA_SC_WINDOW_SCISSOR_TL
    GPU_POINT                   WindowScissorBR;        // PA_SC_WINDOW_SCISSOR_BR
} GPU_WINDOWPACKET;

// Packet 2:

// Packet 2 register block: vertex-index limits, color mask, blend constant,
// stencil reference/masks, alpha reference and viewport scale/offset values.
// Members appear in register order and each trailing comment names the
// hardware register the member maps to. Blank lines separate groups of eight
// DWORD slots.
typedef struct
{
    DWORD                       MaxVtxIndx;             // VGT_MAX_VTX_INDX
    DWORD                       MinVtxIndx;             // VGT_MIN_VTX_INDX
    DWORD                       IndxOffset;             // VGT_INDX_OFFSET
    DWORD                       MultiPrimIbResetIndx;   // VGT_MULTI_PRIM_IB_RESET_INDX
    GPU_COLORMASK               ColorMask;              // RB_COLOR_MASK
    float                       BlendRed;               // RB_BLEND_RED
    float                       BlendGreen;             // RB_BLEND_GREEN
    float                       BlendBlue;              // RB_BLEND_BLUE

    float                       BlendAlpha;             // RB_BLEND_ALPHA
    DWORD                       Unused[3];              // three slots with no named register
    GPU_STENCILREFMASK          StencilRefMaskBF;       // RB_STENCILREFMASK_BF
    GPU_STENCILREFMASK          StencilRefMask;         // RB_STENCILREFMASK
    float                       AlphaRef;               // RB_ALPHA_REF
    float                       VportXScale;            // PA_CL_VPORT_XSCALE

    float                       VportXOffset;           // PA_CL_VPORT_XOFFSET
    float                       VportYScale;            // PA_CL_VPORT_YSCALE
    float                       VportYOffset;           // PA_CL_VPORT_YOFFSET
    float                       VportZScale;            // PA_CL_VPORT_ZSCALE
    float                       VportZOffset;           // PA_CL_VPORT_ZOFFSET
} GPU_VALUESPACKET;

// Packet 3:

// Packet 3 register block: five shader-sequencer (SQ_*) registers. Members
// appear in register order and each trailing comment names the hardware
// register the member maps to. Wrapping0 precedes Wrapping1.
typedef struct
{
    GPU_PROGRAMCONTROL          ProgramControl;         // SQ_PROGRAM_CNTL
    GPU_CONTEXTMISC             ContextMisc;            // SQ_CONTEXT_MISC
    GPU_INTERPOLATORCONTROL     InterpolatorControl;    // SQ_INTERPOLATOR_CNTL
    GPU_WRAPPING0               Wrapping0;              // SQ_WRAPPING_0
    GPU_WRAPPING1               Wrapping1;              // SQ_WRAPPING_1
} GPU_PROGRAMPACKET;

// Packet 4:

// Packet 4 register block: depth, blend, color, hierarchical-Z, clip, mode
// and viewport-transform control words. Members appear in register order and
// each trailing comment names the hardware register the member maps to. The
// blank line separates groups of eight DWORD slots. BlendControl0 sits in the
// first group; BlendControl1..3 follow EdramModeControl in the second.
typedef struct
{
    GPU_DEPTHCONTROL            DepthControl;           // RB_DEPTHCONTROL
    GPU_BLENDCONTROL            BlendControl0;          // RB_BLENDCONTROL0
    GPU_COLORCONTROL            ColorControl;           // RB_COLORCONTROL
    GPU_HICONTROL               HiControl;              // RB_TILECONTROL
    GPU_CLIPCONTROL             ClipControl;            // PA_CL_CLIP_CNTL
    GPU_MODECONTROL             ModeControl;            // PA_SU_SC_MODE_CNTL
    GPU_VTECONTROL              VteControl;             // PA_CL_VTE_CNTL
    DWORD                       Unused;                 // slot with no named register

    GPU_EDRAMMODECONTROL        EdramModeControl;       // RB_MODECONTROL
    GPU_BLENDCONTROL            BlendControl1;          // RB_BLENDCONTROL1
    GPU_BLENDCONTROL            BlendControl2;          // RB_BLENDCONTROL2
    GPU_BLENDCONTROL            BlendControl3;          // RB_BLENDCONTROL3
} GPU_CONTROLPACKET;

// Packet 5:

// Packet 5 register block: point/line sizes, vertex-grouper (VGT_*) output
// path, HOS tessellation and group settings, multipass and viz-query control.
// Members appear in register order and each trailing comment names the
// hardware register the member maps to. Blank lines separate groups of eight
// DWORD slots.
typedef struct
{
    GPU_POINTSIZE               PointSize;              // PA_SU_POINT_SIZE
    GPU_POINTMINMAX             PointMinMax;            // PA_SU_POINT_MINMAX
    GPU_LINECONTROL             LineControl;            // PA_SU_LINE_CNTL
    DWORD                       Unused1;                // slot with no named register
    GPU_OUTPUTPATHCONTROL       OutputPathControl;      // VGT_OUTPUT_PATH_CNTL
    GPU_HOSCONTROL              HosControl;             // VGT_HOS_CNTL
    float                       HosMaxTessLevel;        // VGT_HOS_MAX_TESS_LEVEL
    float                       HosMinTessLevel;        // VGT_HOS_MIN_TESS_LEVEL

    DWORD                       HosReuseDepth;          // VGT_HOS_REUSE_DEPTH
    GPU_GROUPPRIMTYPE           GroupPrimType;          // VGT_GROUP_PRIM_TYPE
    DWORD                       GroupFirstDecr;         // VGT_GROUP_FIRST_DECR
    DWORD                       GroupDecr;              // VGT_GROUP_DECR
    GPU_GROUPVECTCONTROL        GroupVect0Control;      // VGT_GROUP_VECT_0_CNTL
    GPU_GROUPVECTCONTROL        GroupVect1Control;      // VGT_GROUP_VECT_1_CNTL
    GPU_GROUPVECTFMTCONTROL     GroupVect0FmtControl;   // VGT_GROUP_VECT_0_FMT_CNTL
    GPU_GROUPVECTFMTCONTROL     GroupVect1FmtControl;   // VGT_GROUP_VECT_1_FMT_CNTL

    DWORD                       Unused2[2];             // two slots with no named register
    GPU_MPASSPSCONTROL          MPassPsControl;         // PA_SC_MPASS_PS_CNTL
    GPU_VIZQUERY                VizQuery;               // PA_SC_VIZ_QUERY
    GPU_ENHANCE                 Enhance;                // VGT_ENHANCE
} GPU_TESSELLATORPACKET;

// Packet 6:

// Packet 6 register block: scan-converter line/AA configuration, guard-band
// adjustments, shader constant ranges, debug words, vertex-reuse/dealloc
// control, and the copy/clear/sample-count registers. Members appear in
// register order and each trailing comment names the hardware register the
// member maps to. Blank lines separate groups of eight DWORD slots; the
// UnusedN arrays fill slots that have no named register.
typedef struct
{
    GPU_SCLINECONTROL           ScLineControl;          // PA_SC_LINE_CNTL
    GPU_AACONFIG                AaConfig;               // PA_SC_AA_CONFIG
    GPU_VTXCONTROL              VtxControl;             // PA_SU_VTX_CNTL
    float                       GbVertClipAdj;          // PA_CL_GB_VERT_CLIP_ADJ
    float                       GbVertDiscAdj;          // PA_CL_GB_VERT_DISC_ADJ
    float                       GbHorzClipAdj;          // PA_CL_GB_HORZ_CLIP_ADJ
    float                       GbHorzDiscAdj;          // PA_CL_GB_HORZ_DISC_ADJ
    GPU_CONST                   VsConst;                // SQ_VS_CONST

    GPU_CONST                   PsConst;                // SQ_PS_CONST
    GPU_DEBUGMISC0              DebugMisc0;             // SQ_DEBUG_MISC_0
    GPU_DEBUGMISC1              DebugMisc1;             // SQ_DEBUG_MISC_1
    DWORD                       Unused1[5];

    DWORD                       Unused2[2];
    DWORD                       AaMask;                 // PA_SC_AA_MASK
    DWORD                       Unused3[3];
    GPU_VERTEXREUSEBLOCKCONTROL VertexReuseBlockControl;// VGT_VERTEX_REUSE_BLOCK_CNTL
    GPU_OUTDEALLOCCONTROL       OutDeallocControl;      // VGT_OUT_DEALLOC_CNTL

    GPU_COPYCONTROL             CopyControl;            // RB_COPY_CONTROL
    DWORD                       CopyDestBase;           // RB_COPY_DEST_BASE
    GPU_COPYDESTPITCH           CopyDestPitch;          // RB_COPY_DEST_PITCH
    GPU_COPYDESTINFO            CopyDestInfo;           // RB_COPY_DEST_INFO
    DWORD                       HiClear;                // RB_TILE_CLEAR
    DWORD                       DepthClear;             // RB_DEPTH_CLEAR
    DWORD                       ColorClear;             // RB_COLOR_CLEAR
    DWORD                       ColorClearLo;           // RB_COLOR_CLEAR_LO

    GPU_COPYFUNC                CopyFunc;               // RB_COPY_FUNC
    GPU_COPYREF                 CopyRef;                // RB_COPY_REF
    GPU_COPYMASK                CopyMask;               // RB_COPY_MASK
    DWORD                       CopySurfaceSlice;       // RB_COPY_SURFACE_SLICE
    GPU_SAMPLECOUNTCONTROL      SampleCountControl;     // RB_SAMPLE_COUNT_CTL
    DWORD                       SampleCountAddress;     // RB_SAMPLE_COUNT_ADDR
} GPU_MISCPACKET;

// Packet 7:

// Packet 7 register block: eight float registers - polygon-offset scale and
// offset for front and back faces, followed by point radius/size values.
// Each trailing comment names the hardware register the member maps to.
typedef struct
{
    float                       PolyOffsetFrontScale;   // PA_SU_POLY_OFFSET_FRONT_SCALE
    float                       PolyOffsetFrontOffset;  // PA_SU_POLY_OFFSET_FRONT_OFFSET
    float                       PolyOffsetBackScale;    // PA_SU_POLY_OFFSET_BACK_SCALE
    float                       PolyOffsetBackOffset;   // PA_SU_POLY_OFFSET_BACK_OFFSET
    float                       PointXRad;              // PA_CL_POINT_X_RAD
    float                       PointYRad;              // PA_CL_POINT_Y_RAD
    float                       PointConstantSize;      // PA_CL_POINT_SIZE
    float                       PointCullRad;           // PA_CL_POINT_CULL_RAD
} GPU_POINTPACKET;

//------------------------------------------------------------------------------

// Number of packets:

// Matches the eight packet structures above (Packet 0 through Packet 7).
#define GPU_PACKET_COUNT                8

// Upper bound on the size of any single packet, counted in registers
// (one register per DWORD):

#define GPU_PACKET_SIZE                 0x80

// Actual size of each packet:

// Each macro divides the packet structure's byte size by sizeof(DWORD), i.e.
// it yields the packet length in DWORD-sized registers. The expressions have
// type size_t and are compile-time constants, but because they use sizeof
// they cannot be evaluated in #if directives.
#define GPU_DESTINATIONPACKET_SIZE      (sizeof(GPU_DESTINATIONPACKET) / sizeof(DWORD))
#define GPU_WINDOWPACKET_SIZE           (sizeof(GPU_WINDOWPACKET) / sizeof(DWORD))
#define GPU_VALUESPACKET_SIZE           (sizeof(GPU_VALUESPACKET) / sizeof(DWORD))
#define GPU_PROGRAMPACKET_SIZE          (sizeof(GPU_PROGRAMPACKET) / sizeof(DWORD))
#define GPU_CONTROLPACKET_SIZE          (sizeof(GPU_CONTROLPACKET) / sizeof(DWORD))
#define GPU_TESSELLATORPACKET_SIZE      (sizeof(GPU_TESSELLATORPACKET) / sizeof(DWORD))
#define GPU_MISCPACKET_SIZE             (sizeof(GPU_MISCPACKET) / sizeof(DWORD))
#define GPU_POINTPACKET_SIZE            (sizeof(GPU_POINTPACKET) / sizeof(DWORD))

//------------------------------------------------------------------------------

// GPU register addresses, counted in DWORD-sized registers.
//
// - An enumerator without an explicit value is the register immediately
//   following the previous enumerator.
// - The trailing comment on each line names the data type stored in that
//   register (or register range).
// - GPUREG_xxxPACKET and GPUREG_PACKETn are aliases for the first register of
//   packet n.  Packets start at 0x2000 and are spaced 0x80 registers apart.
// - Inside a packet, a blank line is left after every eight registers.
// - GPUREG_UNUSEDn entries only reserve a register slot so that the following
//   enumerators receive the correct address.
typedef enum
{
    GPUREG_WAITUNTIL                    = 0x05C8,       // GPU_WAITUNTIL

    GPUREG_COHERSIZEHOST                = 0x0A2F,       // DWORD
    GPUREG_COHERBASEHOST,                               // DWORD
    GPUREG_COHERSTATUSHOST,                             // GPU_COHERSTATUS

    GPUREG_CPUINTERRUPTACK              = 0x045E,       // GPU_CPUINTERRUPTACK

    GPUREG_CALLBACKADDRESS              = 0x057C,       // DWORD

    GPUREG_GPRMANAGEMENT                = 0x0D00,       // GPU_GPRMANAGEMENT

    GPUREG_INSTSTOREMANAGEMENT          = 0x0D02,       // GPU_INSTSTOREMANAGEMENT

    GPUREG_INVALIDATECONSTANTS          = 0x0D04,       // DWORD

    GPUREG_BCCONTROL                    = 0x0F01,       // GPU_BCCONTROL

    // Packet 0:

    GPUREG_DESTINATIONPACKET            = 0x2000,       // GPU_DESTINATIONPACKET
    GPUREG_PACKET0                      = 0x2000,

    GPUREG_SURFACEINFO                  = 0x2000,       // GPU_SURFACEINFO
    GPUREG_COLOR0INFO,                                  // GPU_COLORINFO
    GPUREG_DEPTHINFO,                                   // GPU_DEPTHINFO
    GPUREG_COLOR1INFO,                                  // GPU_COLORINFO
    GPUREG_COLOR2INFO,                                  // GPU_COLORINFO
    GPUREG_COLOR3INFO,                                  // GPU_COLORINFO
    GPUREG_COHERDESTBASE0,                              // DWORD
    GPUREG_COHERDESTBASE1,                              // DWORD

    GPUREG_COHERDESTBASE2,                              // DWORD
    GPUREG_COHERDESTBASE3,                              // DWORD
    GPUREG_COHERDESTBASE4,                              // DWORD
    GPUREG_COHERDESTBASE5,                              // DWORD
    GPUREG_COHERDESTBASE6,                              // DWORD
    GPUREG_COHERDESTBASE7,                              // DWORD
    GPUREG_SCREENSCISSORTL,                             // GPU_POINT
    GPUREG_SCREENSCISSORBR,                             // GPU_POINT

    // Packet 1:

    GPUREG_WINDOWPACKET                 = 0x2080,       // GPU_WINDOWPACKET
    GPUREG_PACKET1                      = 0x2080,

    GPUREG_WINDOWOFFSET                 = 0x2080,       // GPU_POINT
    GPUREG_WINDOWSCISSORTL,                             // GPU_POINT
    GPUREG_WINDOWSCISSORBR,                             // GPU_POINT

    // Packet 2:

    GPUREG_VALUESPACKET                 = 0x2100,       // GPU_VALUESPACKET
    GPUREG_PACKET2                      = 0x2100,

    GPUREG_MAXVTXINDX                   = 0x2100,       // DWORD
    GPUREG_MINVTXINDX,                                  // DWORD
    GPUREG_INDXOFFSET,                                  // DWORD
    GPUREG_MULTIPRIMIBRESETINDX,                        // DWORD
    GPUREG_COLORMASK,                                   // presumably GPU_COLORMASK (comment previously repeated the enumerator name) - TODO confirm
    GPUREG_BLENDRED,                                    // float
    GPUREG_BLENDGREEN,                                  // float
    GPUREG_BLENDBLUE,                                   // float

    GPUREG_BLENDALPHA,                                  // float
    GPUREG_UNUSED0,
    GPUREG_UNUSED1,
    GPUREG_UNUSED2,
    GPUREG_STENCILREFMASKBF,                            // GPU_STENCILREFMASK
    GPUREG_STENCILREFMASK,                              // GPU_STENCILREFMASK
    GPUREG_ALPHAREF,                                    // float
    GPUREG_VPORTXSCALE,                                 // float

    GPUREG_VPORTXOFFSET,                                // float
    GPUREG_VPORTYSCALE,                                 // float
    GPUREG_VPORTYOFFSET,                                // float
    GPUREG_VPORTZSCALE,                                 // float
    GPUREG_VPORTZOFFSET,                                // float

    // Packet 3:

    GPUREG_PROGRAMPACKET                = 0x2180,       // GPU_PROGRAMPACKET
    GPUREG_PACKET3                      = 0x2180,

    GPUREG_PROGRAMCONTROL               = 0x2180,       // GPU_PROGRAMCONTROL
    GPUREG_CONTEXTMISC,                                 // GPU_CONTEXTMISC
    GPUREG_INTERPOLATORCONTROL,                         // GPU_INTERPOLATORCONTROL
    GPUREG_WRAPPING0,                                   // GPU_WRAPPING0
    GPUREG_WRAPPING1,                                   // GPU_WRAPPING1

    // Packet 4:

    GPUREG_CONTROLPACKET                = 0x2200,       // GPU_CONTROLPACKET
    GPUREG_PACKET4                      = 0x2200,

    GPUREG_DEPTHCONTROL                 = 0x2200,       // GPU_DEPTHCONTROL
    GPUREG_BLENDCONTROL0,                               // GPU_BLENDCONTROL
    GPUREG_COLORCONTROL,                                // GPU_COLORCONTROL
    GPUREG_HICONTROL,                                   // GPU_HICONTROL
    GPUREG_CLIPCONTROL,                                 // GPU_CLIPCONTROL
    GPUREG_MODECONTROL,                                 // GPU_MODECONTROL
    GPUREG_VTECONTROL,                                  // GPU_VTECONTROL
    GPUREG_UNUSED3,

    GPUREG_EDRAMMODECONTROL,                            // GPU_EDRAMMODECONTROL
    GPUREG_BLENDCONTROL1,                               // GPU_BLENDCONTROL
    GPUREG_BLENDCONTROL2,                               // GPU_BLENDCONTROL
    GPUREG_BLENDCONTROL3,                               // GPU_BLENDCONTROL

    // Packet 5:

    GPUREG_TESSELLATORPACKET            = 0x2280,       // GPU_TESSELLATORPACKET
    GPUREG_PACKET5                      = 0x2280,

    GPUREG_POINTSIZE                    = 0x2280,       // GPU_POINTSIZE
    GPUREG_POINTMINMAX,                                 // GPU_POINTMINMAX
    GPUREG_LINECONTROL,                                 // GPU_LINECONTROL
    GPUREG_UNUSED4,
    GPUREG_OUTPUTPATHCONTROL,                           // GPU_OUTPUTPATHCONTROL
    GPUREG_HOSCONTROL,                                  // GPU_HOSCONTROL
    GPUREG_HOSMAXTESSLEVEL,                             // float
    GPUREG_HOSMINTESSLEVEL,                             // float

    GPUREG_HOSREUSEDEPTH,                               // DWORD
    GPUREG_GROUPPRIMTYPE,                               // GPU_GROUPPRIMTYPE
    GPUREG_GROUPFIRSTDECR,                              // DWORD
    GPUREG_GROUPDECR,                                   // DWORD
    GPUREG_GROUPVECT0CONTROL,                           // GPU_GROUPVECTCONTROL
    GPUREG_GROUPVECT1CONTROL,                           // GPU_GROUPVECTCONTROL
    GPUREG_GROUPVECT0FMTCONTROL,                        // GPU_GROUPVECTFMTCONTROL
    GPUREG_GROUPVECT1FMTCONTROL,                        // GPU_GROUPVECTFMTCONTROL

    GPUREG_UNUSED5,
    GPUREG_UNUSED6,
    GPUREG_MPASSPSCONTROL,                              // GPU_MPASSPSCONTROL
    GPUREG_VIZQUERY,                                    // GPU_VIZQUERY
    GPUREG_ENHANCE,                                     // GPU_ENHANCE

    // Packet 6:

    GPUREG_MISCPACKET                   = 0x2300,       // GPU_MISCPACKET
    GPUREG_PACKET6                      = 0x2300,

    GPUREG_SCLINECONTROL                = 0x2300,       // GPU_SCLINECONTROL
    GPUREG_AACONFIG,                                    // GPU_AACONFIG
    GPUREG_VTXCONTROL,                                  // GPU_VTXCONTROL
    GPUREG_GBVERTCLIPADJ,                               // float
    GPUREG_GBVERTDISCADJ,                               // float
    GPUREG_GBHORZCLIPADJ,                               // float
    GPUREG_GBHORZDISCADJ,                               // float
    GPUREG_VSCONST,                                     // GPU_CONST

    GPUREG_PSCONST,                                     // GPU_CONST
    GPUREG_DEBUGMISC0,                                  // GPU_DEBUGMISC0
    GPUREG_DEBUGMISC1,                                  // GPU_DEBUGMISC1
    GPUREG_UNUSED7,
    GPUREG_UNUSED8,
    GPUREG_UNUSED9,
    GPUREG_UNUSED10,
    GPUREG_UNUSED11,

    GPUREG_UNUSED12,
    GPUREG_UNUSED13,
    GPUREG_AAMASK,                                      // DWORD
    GPUREG_UNUSED14,
    GPUREG_UNUSED15,
    GPUREG_UNUSED16,
    GPUREG_VERTEXREUSEBLOCKCONTROL,                     // GPU_VERTEXREUSEBLOCKCONTROL
    GPUREG_OUTDEALLOCCONTROL,                           // GPU_OUTDEALLOCCONTROL

    GPUREG_COPYCONTROL,                                 // GPU_COPYCONTROL
    GPUREG_COPYDESTBASE,                                // GPU_COPYDESTBASE
    GPUREG_COPYDESTPITCH,                               // GPU_COPYDESTPITCH
    GPUREG_COPYDESTINFO,                                // GPU_COPYDESTINFO
    GPUREG_HICLEAR,                                     // DWORD
    GPUREG_DEPTHCLEAR,                                  // DWORD
    GPUREG_COLORCLEAR,                                  // DWORD
    GPUREG_COLORCLEARLO,                                // DWORD

    GPUREG_COPYFUNC,                                    // GPU_COPYFUNC
    GPUREG_COPYREF,                                     // GPU_COPYREF
    GPUREG_COPYMASK,                                    // GPU_COPYMASK
    GPUREG_COPYSURFACESLICE,                            // DWORD
    GPUREG_SAMPLECOUNTCONTROL,                          // GPU_SAMPLECOUNTCONTROL
    GPUREG_SAMPLECOUNTADDRESS,                          // DWORD

    // Packet 7:

    GPUREG_POINTPACKET                  = 0x2380,       // GPU_POINTPACKET
    GPUREG_PACKET7                      = 0x2380,

    GPUREG_POLYOFFSETFRONTSCALE         = 0x2380,       // float
    GPUREG_POLYOFFSETFRONTOFFSET,                       // float
    GPUREG_POLYOFFSETBACKSCALE,                         // float
    GPUREG_POLYOFFSETBACKOFFSET,                        // float
    GPUREG_POINTXRAD,                                   // float
    GPUREG_POINTYRAD,                                   // float
    GPUREG_POINTCONSTANTSIZE,                           // float
    GPUREG_POINTCULLRAD,                                // float

    // Other:
    //
    // The clip planes follow directly after the last packet 7 register; each
    // plane occupies four registers.

    GPUREG_CLIPPLANE0                   = 0x2388,       // float[4]
    GPUREG_CLIPPLANE1                   = 0x238C,       // float[4]
    GPUREG_CLIPPLANE2                   = 0x2390,       // float[4]
    GPUREG_CLIPPLANE3                   = 0x2394,       // float[4]
    GPUREG_CLIPPLANE4                   = 0x2398,       // float[4]
    GPUREG_CLIPPLANE5                   = 0x239C,       // float[4]

    GPUREG_ALUCONSTANTS                 = 0x4000,       // float[GPU_ALU_CONSTANTS][4]

    GPUREG_FETCHCONSTANTS               = 0x4800,       // GPUFETCH_CONSTANT[GPU_FETCH_CONSTANTS]

    // The flow constants and the boolean constants start at the same
    // register; the integer constants begin eight registers later.

    GPUREG_FLOWCONSTANTS                = 0x4900,       // DWORD[GPU_FLOW_CONSTANTS]
    GPUREG_BOOLEANCONSTANTS             = 0x4900,       // DWORD[GPU_BOOLEAN_CONSTANTS/32]
    GPUREG_INTEGERCONSTANTS             = 0x4908,       // DWORD[GPU_INTEGER_CONSTANTS]

    GPUREG_FLUSHFETCHCONSTANTS          = 0x5000,       // DWORD[3]

} GPUREGISTER;

// When using GPUCOMMANDOP_SET_CONSTANT to set a register, subtract this value
// from the register address.  The value equals GPUREG_PACKET0 (0x2000), the
// first packet register listed in GPUREGISTER:

#define GPU_SET_CONSTANT_REGISTER_OFFSET  0x2000

//------------------------------------------------------------------------------

// Image of the flow-control constant registers.  The same storage can be
// viewed either as one flat DWORD array (FlowConstant) or as the packed
// boolean constants (32 booleans per DWORD) followed by the integer
// constants.  This mirrors GPUREGISTER, where GPUREG_FLOWCONSTANTS and
// GPUREG_BOOLEANCONSTANTS share one address and GPUREG_INTEGERCONSTANTS
// follows.
typedef struct
{
    union {
        DWORD FlowConstant[GPU_FLOW_CONSTANTS];
        struct {
            DWORD BooleanConstant[GPU_BOOLEAN_CONSTANTS/32];
            DWORD IntegerConstant[GPU_INTEGER_CONSTANTS];
        };
    };
} GPUFLOW_CONSTANTS;

// Sets or clears a single vertex shader boolean constant inside a flow
// constant image.
//
//   pFlowConstants - Flow constant image whose BooleanConstant array is
//                    modified in place.
//   Register       - Vertex shader boolean register number.  It is offset by
//                    GPU_D3D_VERTEX_CONSTANTB_BASE to select the DWORD that
//                    holds the bit.  No range checking is performed.
//   Boolean        - Only bit 0 of this value is stored.
//
// The bit position inside the DWORD is taken from Register alone, which
// assumes GPU_D3D_VERTEX_CONSTANTB_BASE is a multiple of 32 - TODO confirm.
__forceinline
void GPU_SET_VERTEX_SHADER_CONSTANTB(
    GPUFLOW_CONSTANTS* pFlowConstants,
    DWORD Register,
    BOOL Boolean)
{
    DWORD index = (Register + GPU_D3D_VERTEX_CONSTANTB_BASE) / 32;
    DWORD shift = (Register % 32);

    // Build the mask and the new bit as DWORDs.  Shifting the signed int
    // constant 1 (or a signed BOOL) left by 31 is not representable in a
    // signed 32-bit int, so the shift must be done on an unsigned type.
    DWORD mask = ((DWORD) 1) << shift;
    DWORD value = ((DWORD) (Boolean & 1)) << shift;

    pFlowConstants->BooleanConstant[index] =
        (pFlowConstants->BooleanConstant[index] & ~mask) | value;
}

// Sets or clears a single pixel shader boolean constant inside a flow
// constant image.
//
//   pFlowConstants - Flow constant image whose BooleanConstant array is
//                    modified in place.
//   Register       - Pixel shader boolean register number.  It is offset by
//                    GPU_D3D_PIXEL_CONSTANTB_BASE to select the DWORD that
//                    holds the bit.  No range checking is performed.
//   Boolean        - Only bit 0 of this value is stored.
//
// The bit position inside the DWORD is taken from Register alone, which
// assumes GPU_D3D_PIXEL_CONSTANTB_BASE is a multiple of 32 - TODO confirm.
__forceinline
void GPU_SET_PIXEL_SHADER_CONSTANTB(
    GPUFLOW_CONSTANTS* pFlowConstants,
    DWORD Register,
    BOOL Boolean)
{
    DWORD index = (Register + GPU_D3D_PIXEL_CONSTANTB_BASE) / 32;
    DWORD shift = (Register % 32);

    // Build the mask and the new bit as DWORDs.  Shifting the signed int
    // constant 1 (or a signed BOOL) left by 31 is not representable in a
    // signed 32-bit int, so the shift must be done on an unsigned type.
    DWORD mask = ((DWORD) 1) << shift;
    DWORD value = ((DWORD) (Boolean & 1)) << shift;

    pFlowConstants->BooleanConstant[index] =
        (pFlowConstants->BooleanConstant[index] & ~mask) | value;
}

// Stores one vertex shader integer constant into a flow constant image.
//
//   pFlowConstants - Flow constant image whose IntegerConstant array is
//                    written.
//   Register       - Vertex shader integer register number; the entry written
//                    is Register + GPU_D3D_VERTEX_CONSTANTI_BASE.  No range
//                    checking is performed.
//   pConstantData  - Points to at least three ints.  The low 8 bits of
//                    elements 0, 1 and 2 are packed into bits 0-7, 8-15 and
//                    16-23 of the stored DWORD; bits 24-31 are zero.
__forceinline
void GPU_SET_VERTEX_SHADER_CONSTANTI(
    GPUFLOW_CONSTANTS* pFlowConstants,
    DWORD Register,
    const int* pConstantData)
{
    const int byte0 = pConstantData[0] & 0xff;
    const int byte1 = pConstantData[1] & 0xff;
    const int byte2 = pConstantData[2] & 0xff;

    pFlowConstants->IntegerConstant[Register + GPU_D3D_VERTEX_CONSTANTI_BASE] =
        (DWORD) (byte0 | (byte1 << 8) | (byte2 << 16));
}

// Stores one pixel shader integer constant into a flow constant image.
//
//   pFlowConstants - Flow constant image whose IntegerConstant array is
//                    written.
//   Register       - Pixel shader integer register number; the entry written
//                    is Register + GPU_D3D_PIXEL_CONSTANTI_BASE.  No range
//                    checking is performed.
//   pConstantData  - Points to at least three ints.  The low 8 bits of
//                    elements 0, 1 and 2 are packed into bits 0-7, 8-15 and
//                    16-23 of the stored DWORD; bits 24-31 are zero.
__forceinline
void GPU_SET_PIXEL_SHADER_CONSTANTI(
    GPUFLOW_CONSTANTS* pFlowConstants,
    DWORD Register,
    const int* pConstantData)
{
    const int byte0 = pConstantData[0] & 0xff;
    const int byte1 = pConstantData[1] & 0xff;
    const int byte2 = pConstantData[2] & 0xff;

    pFlowConstants->IntegerConstant[Register + GPU_D3D_PIXEL_CONSTANTI_BASE] =
        (DWORD) (byte0 | (byte1 << 8) | (byte2 << 16));
}

// One-DWORD size descriptor for a 1D texture: a 24-bit width followed by
// 8 unused bits.  Used as the OneD view of GPUTEXTURE_FETCH_CONSTANT::Size.
typedef struct
{
    DWORD Width                         : 24;   // DWORD
    DWORD                               : 8;
} GPUTEXTURESIZE_1D;

// One-DWORD size descriptor for a 2D texture: 13-bit width, 13-bit height and
// 6 unused bits.  Used as the TwoD view of GPUTEXTURE_FETCH_CONSTANT::Size.
typedef struct
{
    DWORD Width                         : 13;   // DWORD
    DWORD Height                        : 13;   // DWORD
    DWORD                               : 6;
} GPUTEXTURESIZE_2D;

// One-DWORD size descriptor for a stacked texture: same width/height layout as
// GPUTEXTURESIZE_2D, with the remaining 6 bits holding the depth.  Used as the
// Stack view of GPUTEXTURE_FETCH_CONSTANT::Size.
typedef struct
{
    DWORD Width                         : 13;   // DWORD
    DWORD Height                        : 13;   // DWORD
    DWORD Depth                         : 6;    // DWORD
} GPUTEXTURESIZE_STACK;

// One-DWORD size descriptor for a 3D texture: 11-bit width, 11-bit height and
// 10-bit depth.  Used as the ThreeD view of GPUTEXTURE_FETCH_CONSTANT::Size.
typedef struct
{
    DWORD Width                         : 11;   // DWORD
    DWORD Height                        : 11;   // DWORD
    DWORD Depth                         : 10;   // DWORD
} GPUTEXTURESIZE_3D;

// Texture fetch constant: six DWORDs, accessible either through the named
// bit-fields or as the raw dword[6] array.  The bit-fields of every DWORD add
// up to exactly 32 bits; unnamed fields are padding.  The trailing comment on
// each field names the type of value it holds.  Fields declared INT are
// signed.
typedef union {
    struct {
        // DWORD 0:

        DWORD Type                      : 2;    // GPUCONSTANTTYPE
        DWORD SignX                     : 2;    // GPUSIGN
        DWORD SignY                     : 2;    // GPUSIGN
        DWORD SignZ                     : 2;    // GPUSIGN
        DWORD SignW                     : 2;    // GPUSIGN
        DWORD ClampX                    : 3;    // GPUCLAMP
        DWORD ClampY                    : 3;    // GPUCLAMP
        DWORD ClampZ                    : 3;    // GPUCLAMP
        DWORD                           : 2;
        DWORD                           : 1;
        DWORD Pitch                     : 9;    // DWORD
        DWORD Tiled                     : 1;    // BOOL

        // DWORD 1:

        DWORD DataFormat                : 6;    // GPUTEXTUREFORMAT
        DWORD Endian                    : 2;    // GPUENDIAN
        DWORD RequestSize               : 2;    // GPUREQUESTSIZE
        DWORD Stacked                   : 1;    // BOOL
        DWORD ClampPolicy               : 1;    // GPUCLAMPPOLICY
        DWORD BaseAddress               : 20;   // DWORD

        // DWORD 2:
        //
        // The same DWORD interpreted according to the kind of texture; every
        // member of this union is one DWORD wide.

        union
        {
            GPUTEXTURESIZE_1D OneD;
            GPUTEXTURESIZE_2D TwoD;
            GPUTEXTURESIZE_3D ThreeD;
            GPUTEXTURESIZE_STACK Stack;
        } Size;

        // DWORD 3:

        DWORD NumFormat                 : 1;    // GPUNUMFORMAT
        DWORD SwizzleX                  : 3;    // GPUSWIZZLE
        DWORD SwizzleY                  : 3;    // GPUSWIZZLE
        DWORD SwizzleZ                  : 3;    // GPUSWIZZLE
        DWORD SwizzleW                  : 3;    // GPUSWIZZLE
        INT   ExpAdjust                 : 6;    // int
        DWORD MagFilter                 : 2;    // GPUMINMAGFILTER
        DWORD MinFilter                 : 2;    // GPUMINMAGFILTER
        DWORD MipFilter                 : 2;    // GPUMIPFILTER
        DWORD AnisoFilter               : 3;    // GPUANISOFILTER
        DWORD                           : 3;
        DWORD BorderSize                : 1;    // DWORD

        // DWORD 4:

        DWORD VolMagFilter              : 1;    // GPUMINMAGFILTER
        DWORD VolMinFilter              : 1;    // GPUMINMAGFILTER
        DWORD MinMipLevel               : 4;    // DWORD
        DWORD MaxMipLevel               : 4;    // DWORD
        DWORD MagAnisoWalk              : 1;    // BOOL
        DWORD MinAnisoWalk              : 1;    // BOOL
        INT   LODBias                   : 10;   // int
        INT   GradExpAdjustH            : 5;    // int
        INT   GradExpAdjustV            : 5;    // int

        // DWORD 5:

        DWORD BorderColor               : 2;    // GPUBORDERCOLOR
        DWORD ForceBCWToMax             : 1;    // BOOL
        DWORD TriClamp                  : 2;    // GPUTRICLAMP
        INT   AnisoBias                 : 4;    // int
        DWORD Dimension                 : 2;    // GPUDIMENSION
        DWORD PackedMips                : 1;    // BOOL
        DWORD MipAddress                : 20;   // DWORD
    };
    DWORD dword[6];
} GPUTEXTURE_FETCH_CONSTANT;

// Vertex fetch constant: two DWORDs, accessible either through the named
// bit-fields or as the raw dword[2] array.  The bit-fields of each DWORD add
// up to exactly 32 bits; the unnamed field is padding.
typedef union {
    struct {
        // DWORD 0:

        DWORD Type                      : 2;    // GPUCONSTANTTYPE
        DWORD BaseAddress               : 30;   // DWORD

        // DWORD 1:

        DWORD Endian                    : 2;    // GPUENDIAN
        DWORD Size                      : 24;   // DWORD
        DWORD AddressClamp              : 1;    // GPUADDRESSCLAMP
        DWORD                           : 1;
        DWORD RequestSize               : 2;    // GPUREQUESTSIZE
        DWORD ClampDisable              : 2;    // BOOL - NOTE(review): field is two bits wide despite the BOOL annotation; confirm encoding
    };
    DWORD dword[2];
} GPUVERTEX_FETCH_CONSTANT;

// One fetch constant slot.  The slot holds either a single texture fetch
// constant (six DWORDs) or three vertex fetch constants (two DWORDs each), so
// both views occupy the same six DWORDs.
typedef union {
    GPUTEXTURE_FETCH_CONSTANT           Texture;
    GPUVERTEX_FETCH_CONSTANT            Vertex[3];
} GPUFETCH_CONSTANT;

// Texture fetch shader instruction: three DWORDs, accessible either through
// the named bit-fields or as the raw dword[3] array.  The bit-fields of every
// DWORD add up to exactly 32 bits; unnamed fields are padding.  Fields
// declared INT are signed.
typedef union {
    struct {
        // DWORD 0:

        DWORD Op                        : 5;    // GPUTEXTUREFETCHOP
        DWORD SrcGPR                    : 6;    // DWORD
        DWORD SrcLoopIndexRelative      : 1;    // BOOL
        DWORD DestGPR                   : 6;    // DWORD
        DWORD DestLoopIndexRelative     : 1;    // BOOL
        DWORD FetchValidOnly            : 1;    // BOOL
        DWORD ConstIndex                : 5;    // DWORD
        DWORD UnnormalizedTextureCoords : 1;    // BOOL
        DWORD SrcSelectX                : 2;    // DWORD
        DWORD SrcSelectY                : 2;    // DWORD
        DWORD SrcSelectZ                : 2;    // DWORD

        // DWORD 1:

        DWORD SwizzleX                  : 3;    // GPUSWIZZLE
        DWORD SwizzleY                  : 3;    // GPUSWIZZLE
        DWORD SwizzleZ                  : 3;    // GPUSWIZZLE
        DWORD SwizzleW                  : 3;    // GPUSWIZZLE
        DWORD MagFilter                 : 2;    // GPUMINMAGFILTER
        DWORD MinFilter                 : 2;    // GPUMINMAGFILTER
        DWORD MipFilter                 : 2;    // GPUMIPFILTER
        DWORD AnisoFilter               : 3;    // GPUANISOFILTER
        DWORD                           : 3;
        DWORD VolMagFilter              : 2;    // GPUMINMAGFILTER
        DWORD VolMinFilter              : 2;    // GPUMINMAGFILTER
        DWORD UseComputedLOD            : 1;    // BOOL
        DWORD UseRegisterLOD            : 1;    // BOOL
        DWORD                           : 1;
        DWORD Predicated                : 1;    // BOOL

        // DWORD 2:

        DWORD UseRegisterGradients      : 1;    // BOOL
        DWORD                           : 1;
        INT   LODBias                   : 7;    // int
        DWORD                           : 5;
        DWORD Dimension                 : 2;    // GPUDIMENSION (pseudo register)
        INT   OffsetX                   : 5;    // int
        INT   OffsetY                   : 5;    // int
        INT   OffsetZ                   : 5;    // int
        DWORD PredicationCondition      : 1;    // DWORD
    };
    DWORD dword[3];
} GPUTEXTURE_FETCH_INSTRUCTION;

// Vertex fetch shader instruction: three DWORDs, accessible either through
// the named bit-fields or as the raw dword[3] array.  The bit-fields of every
// DWORD add up to exactly 32 bits; unnamed fields are padding.  Fields
// declared INT are signed.
//
// ConstIndex and ConstIndexSelect together identify the vertex fetch
// constant; SetVertexStream() and GetVertexStream() convert between them and
// a logical stream number.
typedef union {
    struct {
        // DWORD 0:

        DWORD Op                        : 5;    // GPUVERTEXFETCHOP
        DWORD SrcGPR                    : 6;    // DWORD
        DWORD SrcLoopIndexRelative      : 1;    // BOOL
        DWORD DestGPR                   : 6;    // DWORD
        DWORD DestLoopIndexRelative     : 1;    // BOOL
        DWORD FetchValidOnly            : 1;    // Must be 1
        DWORD ConstIndex                : 5;    // DWORD
        DWORD ConstIndexSelect          : 2;    // DWORD
        DWORD PrefetchCount             : 3;    // DWORD
        DWORD SrcSelect                 : 2;    // DWORD

        // DWORD 1:

        DWORD SwizzleX                  : 3;    // GPUSWIZZLE
        DWORD SwizzleY                  : 3;    // GPUSWIZZLE
        DWORD SwizzleZ                  : 3;    // GPUSWIZZLE
        DWORD SwizzleW                  : 3;    // GPUSWIZZLE
        DWORD Signed                    : 1;    // BOOL
        DWORD NumFormat                 : 1;    // GPUNUMFORMAT
        DWORD                           : 1;
        DWORD RoundIndex                : 1;    // BOOL
        DWORD DataFormat                : 6;    // GPUVERTEXFORMAT
        DWORD                           : 2;
        INT   ExpAdjust                 : 6;    // int
        DWORD MiniFetch                 : 1;    // BOOL
        DWORD Predicated                : 1;    // BOOL

        // DWORD 2:

        DWORD Stride                    : 8;    // DWORD
        INT   Offset                    : 23;   // int
        DWORD PredicationCondition      : 1;    // DWORD
    };
    DWORD dword[3];
} GPUVERTEX_FETCH_INSTRUCTION;

//------------------------------------------------------------------------------------------------
// Convert logical vertex stream number to hardware format

// Writes the hardware fetch-constant selection for a logical vertex stream.
//
//   pVertexFetch - Instruction whose ConstIndex and ConstIndexSelect fields
//                  are written.
//   stream       - Logical stream number.  It is first reversed to
//                  (GPU_VERTEX_FETCH_CONSTANTS - 1) - stream; the quotient of
//                  that value by 3 goes to ConstIndex and the remainder to
//                  ConstIndexSelect.
__forceinline
void SetVertexStream(GPUVERTEX_FETCH_INSTRUCTION* pVertexFetch, DWORD stream)
{
    // A single multiply yields both results in 8.8 fixed point: the low
    // 16 bits carry (1/3) * value and the high 16 bits carry (4/3) * value.
    // Bits 8 and up of the low half are value / 3; the two lowest integer
    // bits of the high half are value % 3.
    static const DWORD kRemainderAndDiv =
        (((DWORD) ((4.0 / 3.0) * 256 + 1)) << 16)
        | (0xffff & (DWORD) ((1.0 / 3.0) * 256 + 1));

    const DWORD reversed = (GPU_VERTEX_FETCH_CONSTANTS - 1) - stream;
    const DWORD product = reversed * kRemainderAndDiv;

    pVertexFetch->ConstIndexSelect = (product >> 24) & 0x3;
    pVertexFetch->ConstIndex = (product >> 8) & 0xff;
}

//------------------------------------------------------------------------------------------------
// Convert hardware format to logical vertex stream

// Returns the logical vertex stream number encoded in a vertex fetch
// instruction.  ConstIndex * 3 + ConstIndexSelect is the hardware slot; the
// logical stream is that slot counted back from
// GPU_VERTEX_FETCH_CONSTANTS - 1.
__forceinline
DWORD GetVertexStream(const GPUVERTEX_FETCH_INSTRUCTION* pVertexFetch)
{
    const DWORD hardwareSlot =
        pVertexFetch->ConstIndex * 3 + pVertexFetch->ConstIndexSelect;

    return (GPU_VERTEX_FETCH_CONSTANTS - 1) - hardwareSlot;
}

// Bit layout of an "exec" flow-control instruction.  The fields describe
// 48 bits: 32 bits in the first DWORD followed by 16 bits in the second.
// The vertex cache value is split across the DWORD boundary (VertexCacheLow
// then VertexCacheHigh).  Op is the final 4-bit field.
typedef union {
    struct {
        DWORD Address                   : 12;
        DWORD Count                     : 3;
        DWORD Yield                     : 1;
        DWORD TypeAndSerialize          : 12;   // GPUEXECSERIALIZEMODE, GPUEXECINSTRUCTIONTYPE
        DWORD VertexCacheLow            : 4;    // GPUEXECFETCHCACHETYPE

        DWORD VertexCacheHigh           : 2;    // GPUEXECFETCHCACHETYPE
        DWORD                           : 7;
        DWORD PredicateClean            : 1;
        DWORD                           : 1;
        DWORD AbsoluteAddressing        : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_EXEC;

// Bit layout of a conditional "exec" flow-control instruction.  The first
// DWORD matches GPUFLOW_EXEC; the 16 bits in the second DWORD add an 8-bit
// BooleanAddress and a 1-bit Condition.  Op is the final 4-bit field.
typedef union {
    struct {
        DWORD Address                   : 12;
        DWORD Count                     : 3;
        DWORD Yield                     : 1;
        DWORD TypeAndSerialize          : 12;
        DWORD VertexCacheLow            : 4;

        DWORD VertexCacheHigh           : 2;
        DWORD BooleanAddress            : 8;
        DWORD Condition                 : 1;
        DWORD AbsoluteAddressing        : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_COND_EXEC;

// Bit layout of a predicate-conditional "exec" flow-control instruction.
// Same layout as GPUFLOW_EXEC, except that the single padding bit before
// AbsoluteAddressing is the Condition field here.  Op is the final 4-bit
// field.
typedef union {
    struct {
        DWORD Address                   : 12;
        DWORD Count                     : 3;
        DWORD Yield                     : 1;
        DWORD TypeAndSerialize          : 12;
        DWORD VertexCacheLow            : 4;

        DWORD VertexCacheHigh           : 2;
        DWORD                           : 7;
        DWORD PredicateClean            : 1;
        DWORD Condition                 : 1;
        DWORD AbsoluteAddressing        : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_COND_EXEC_PREDS;

// Bit layout of a "loop start" flow-control instruction: 32 bits in the first
// DWORD followed by 16 bits in the second.  Unnamed fields are padding.  Op is
// the final 4-bit field.
typedef union {
    struct {
        DWORD JumpAddress               : 13;
        DWORD Repeat                    : 1;
        DWORD                           : 2;
        DWORD LoopID                    : 5;
        DWORD                           : 11;

        DWORD                           : 11;
        DWORD AbsoluteAddressing        : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_LOOP_START;

// Bit layout of a "loop end" flow-control instruction: 32 bits in the first
// DWORD followed by 16 bits in the second.  Unnamed fields are padding.  Op is
// the final 4-bit field.
typedef union {
    struct {
        DWORD JumpAddress               : 13;
        DWORD                           : 3;
        DWORD LoopID                    : 5;
        DWORD PredicateBreak            : 1;
        DWORD                           : 10;

        DWORD                           : 10;
        DWORD Condition                 : 1;
        DWORD AbsoluteAddressing        : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_LOOP_END;

// Bit layout of a conditional "call" flow-control instruction: 32 bits in the
// first DWORD followed by 16 bits in the second.  Unnamed fields are padding.
// Op is the final 4-bit field.
typedef union {
    struct {
        DWORD JumpAddress               : 13;
        DWORD Force                     : 1;
        DWORD Predicated                : 1;
        DWORD                           : 17;

        DWORD                           : 2;
        DWORD BooleanAddress            : 8;
        DWORD Condition                 : 1;
        DWORD AbsoluteAddressing        : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_COND_CALL;

// Bit layout of a "return" flow-control instruction.  The whole first DWORD
// and the first 11 bits of the second are padding; only AbsoluteAddressing and
// the final 4-bit Op field are named.
typedef union {
    struct {
        DWORD                           : 32;

        DWORD                           : 11;
        DWORD AbsoluteAddressing        : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_RETURN;

// Bit layout of a conditional "jump" flow-control instruction.  Same layout
// as GPUFLOW_COND_CALL, except that the second padding bit of the second
// DWORD is the ForwardOnly field here.  Op is the final 4-bit field.
typedef union {
    struct {
        DWORD JumpAddress               : 13;
        DWORD Force                     : 1;
        DWORD Predicated                : 1;
        DWORD                           : 17;

        DWORD                           : 1;
        DWORD ForwardOnly               : 1;
        DWORD BooleanAddress            : 8;
        DWORD Condition                 : 1;
        DWORD AbsoluteAddressing        : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_COND_JUMP;

// Bit layout of an "alloc" flow-control instruction: 32 bits in the first
// DWORD followed by 16 bits in the second.  Unnamed fields are padding.  Op is
// the final 4-bit field.
typedef union {
    struct {
        DWORD Size                      : 3;
        DWORD                           : 29;

        DWORD                           : 8;
        DWORD DoNotSerialize            : 1;
        DWORD BufferSelect              : 2;    // GPUALLOCBUFFERSELECT
        DWORD                           : 1;
        DWORD Op                        : 4;
    };
} GPUFLOW_ALLOC;

// A single decoded flow-control instruction.  Op sits at the same position in
// every variant (the last 4 of the 48 described bits), so it can be read
// through the anonymous struct to decide which variant member applies.  The
// storage is two DWORDs (dword[2]); the bit-fields cover the first DWORD and
// 16 bits of the second.
typedef union {
    struct {
        DWORD                           : 32;

        DWORD                           : 12;
        DWORD Op                        : 4;    // GPUFLOWOP
    };
    GPUFLOW_EXEC                        Exec;
    GPUFLOW_COND_EXEC                   CondExec;
    GPUFLOW_COND_EXEC_PREDS             CondExecPreds;
    GPUFLOW_LOOP_START                  LoopStart;
    GPUFLOW_LOOP_END                    LoopEnd;
    GPUFLOW_COND_CALL                   CondCall;
    GPUFLOW_RETURN                      Return;
    GPUFLOW_COND_JUMP                   CondJump;
    GPUFLOW_ALLOC                       Alloc;
    DWORD                               dword[2];
} GPUFLOW_INSTRUCTION;

// Use GPU_GET_FLOW_INSTRUCTIONS() to decode GPUFLOW_INSTRUCTION_PAIR:

// Two 48-bit flow-control instructions packed back to back into three DWORDs.
// The first instruction occupies dword[0] and the low 16 bits of dword[1];
// the second occupies the high 16 bits of dword[1] and all of dword[2].
typedef struct {
    DWORD                               dword[3];
} GPUFLOW_INSTRUCTION_PAIR;

// ALU shader instruction: three DWORDs, accessible either through the named
// bit-fields or as the raw dword[3] array.  The bit-fields of every DWORD add
// up to exactly 32 bits.  Each instruction carries both a vector operation
// (VectorOp) and a scalar operation (ScalarOp) with separate destinations,
// write masks and saturate flags, and up to three sources (A, B, C).
typedef union {
    struct {
        // DWORD 0:

        DWORD VectorDest                : 6;
        DWORD VectorDestRelative        : 1;
        DWORD AbsConstants              : 1;
        DWORD ScalarDest                : 6;
        DWORD ScalarDestRelative        : 1;    // Also used as export mask
        DWORD ResultExport              : 1;
        DWORD VectorMask                : 4;
        DWORD ScalarMask                : 4;
        DWORD VectorSaturate            : 1;
        DWORD ScalarSaturate            : 1;
        DWORD ScalarOp                  : 6;    // GPUALUSCALAROP

        // DWORD 1:

        DWORD SrcCSwizzle               : 8;
        DWORD SrcBSwizzle               : 8;
        DWORD SrcASwizzle               : 8;
        DWORD SrcCNegate                : 1;
        DWORD SrcBNegate                : 1;
        DWORD SrcANegate                : 1;
        DWORD PredicationCondition      : 1;
        DWORD Predicated                : 1;
        DWORD RelativeAddress           : 1;
        DWORD Constant1Relative         : 1;
        DWORD Constant0Relative         : 1;

        // DWORD 2:

        DWORD SrcCRegConst              : 8;
        DWORD SrcBRegConst              : 8;
        DWORD SrcARegConst              : 8;
        DWORD VectorOp                  : 5;    // GPUALUVECTOROP
        DWORD SrcCSelect                : 1;    // GPUALUSRCSELECT
        DWORD SrcBSelect                : 1;    // GPUALUSRCSELECT
        DWORD SrcASelect                : 1;    // GPUALUSRCSELECT
    };
    DWORD dword[3];
} GPUALU_INSTRUCTION;

//------------------------------------------------------------------------------
// used for GPUALUSCALAROP_MULC0..GPUALUSCALAROP_SUBC1

// Reassembles the second source register number that is scattered across
// three fields of an ALU instruction:
//
//   bits 2-5 of the result come from bits 2-5 of SrcCSwizzle,
//   bit 1 comes from SrcCSelect,
//   bit 0 comes from the low bit of ScalarOp.
static __forceinline
DWORD GPU_GET_SCALAROP_SOURCE2_REG(
    CONST GPUALU_INSTRUCTION* pALU)
{
    DWORD sourceReg = pALU->ScalarOp & 1;

    sourceReg |= pALU->SrcCSelect << 1;
    sourceReg |= pALU->SrcCSwizzle & 0x3C;

    return sourceReg;
}

// Scatters a second source register number across three fields of an ALU
// instruction (the inverse of GPU_GET_SCALAROP_SOURCE2_REG):
//
//   bits 2-5 of reg replace bits 2-5 of SrcCSwizzle (other bits are kept),
//   bit 1 of reg is stored in the one-bit SrcCSelect field,
//   bit 0 of reg replaces the low bit of ScalarOp (other bits are kept).
static __forceinline
VOID GPU_SET_SCALAROP_SOURCE2_REG(
    GPUALU_INSTRUCTION* pALU,
    DWORD reg)
{
    const DWORD keptSwizzleBits = pALU->SrcCSwizzle & 0xC3;
    const DWORD keptScalarOpBits = pALU->ScalarOp & 0xFE;

    pALU->ScalarOp = keptScalarOpBits | (reg & 1);

    // SrcCSelect is one bit wide, so only bit 1 of reg survives the store.
    pALU->SrcCSelect = reg >> 1;

    pALU->SrcCSwizzle = keptSwizzleBits | (reg & 0x3C);
}

//------------------------------------------------------------------------------

// One shader instruction slot viewed as any of the instruction kinds.
// Flow0 overlays the first flow-control instruction of FlowPair directly; use
// GPU_GET_FLOW_INSTRUCTIONS() on FlowPair to obtain both flow instructions.
typedef union {
    GPUTEXTURE_FETCH_INSTRUCTION        TextureFetch;
    GPUVERTEX_FETCH_INSTRUCTION         VertexFetch;
    GPUFLOW_INSTRUCTION                 Flow0;
    GPUFLOW_INSTRUCTION_PAIR            FlowPair;
    GPUALU_INSTRUCTION                  Alu;
} GPUSHADER_INSTRUCTION;

//------------------------------------------------------------------------------

// Unpacks the two 48-bit flow-control instructions stored in a three-DWORD
// pair.
//
//   pPair  - Packed pair to decode.
//   pFlow0 - Receives the first instruction: dword[0] of the pair and the low
//            16 bits of dword[1].
//   pFlow1 - Receives the second instruction: the high 16 bits of dword[1]
//            and all of dword[2], shifted down by 16 bits.
//
// The upper 16 bits of each output's dword[1] are written as zero.
//
// All three source DWORDs are read before anything is written, so an output
// may overlap *pPair (for example GPUSHADER_INSTRUCTION::Flow0 shares storage
// with GPUSHADER_INSTRUCTION::FlowPair).
static __forceinline
void GPU_GET_FLOW_INSTRUCTIONS(
    const GPUFLOW_INSTRUCTION_PAIR* pPair,
    GPUFLOW_INSTRUCTION* pFlow0,
    GPUFLOW_INSTRUCTION* pFlow1)
{
    const DWORD low = pPair->dword[0];
    const DWORD middle = pPair->dword[1];
    const DWORD high = pPair->dword[2];

    pFlow0->dword[0] = low;
    pFlow0->dword[1] = 0xffff & middle; // Load low 16 bits.
    pFlow1->dword[0] = (middle >> 16) | (high << 16);
    pFlow1->dword[1] = (high >> 16);
}

// Packs two 48-bit flow-control instructions into a three-DWORD pair (the
// inverse of GPU_GET_FLOW_INSTRUCTIONS).
//
//   pPair  - Receives the packed pair.
//   pFlow0 - First instruction; its dword[0] and the low 16 bits of its
//            dword[1] are stored.
//   pFlow1 - Second instruction; its bits are stored shifted up by 16,
//            starting in the high half of the pair's dword[1].
static __forceinline
void GPU_PUT_FLOW_INSTRUCTIONS(
    GPUFLOW_INSTRUCTION_PAIR* pPair,
    const GPUFLOW_INSTRUCTION* pFlow0,
    const GPUFLOW_INSTRUCTION* pFlow1)
{
    DWORD* packed = pPair->dword;

    packed[0] = pFlow0->dword[0];
    packed[1] = (pFlow1->dword[0] << 16) | (pFlow0->dword[1] & 0xffff);
    packed[2] = (pFlow1->dword[1] << 16) | (pFlow1->dword[0] >> 16);
}

//------------------------------------------------------------------------------

// Memory-export stream constant: one four-component float constant
// (x, y, z, w) whose bits can also be accessed as named bit-fields or as the
// raw dword[4] array.  Fields whose names start with an underscore hold fixed
// bit patterns that must be set to the value given in the field's comment;
// GPU_SET_MEMEXPORT_STREAM_CONSTANT() fills them in.
typedef union {
    struct {
        // float x:

        DWORD BaseAddress                   : 30; // Physical address in DWORDs
        DWORD _01                           : 2;  // Must be 01

        // float y:

        DWORD _4B000000                     : 32; // Must be 0x4B000000

        // float z:

        DWORD EndianSwap                    : 3; // GPUENDIAN128
        DWORD                               : 5;
        DWORD Format                        : 6; // GPUCOLORFORMAT
        DWORD                               : 2;
        DWORD NumericType                   : 3; // GPUSURFACENUMBER
        DWORD ComponentSwap                 : 1; // GPUSURFACESWAP
        DWORD _4B0                          : 12; // Must be 0x4B0

        // float w:

        DWORD StreamMaxIndex                : 23; // 0..StreamMaxIndex - 1 is valid
        DWORD _96                           : 9;  // Must be 0x96
    };
    DWORD dword[4];
    float c[4];
} GPU_MEMEXPORT_STREAM_CONSTANT;

//------------------------------------------------------------------------------

#ifdef _DEBUG

// Debug builds only see this prototype; the definition is not part of this
// section of the header.

void GPU_SET_MEMEXPORT_STREAM_CONSTANT(
    GPU_MEMEXPORT_STREAM_CONSTANT* pConstant,
    VOID* pBaseAddress,
    DWORD StreamMaxIndex,
    GPUSURFACESWAP ComponentSwap,
    GPUSURFACENUMBER NumericType,
    GPUCOLORFORMAT Format,
    GPUENDIAN128 EndianSwap);

#else

// Fills in every named field of a memexport stream constant.
//
//   pConstant      - constant to initialize.
//   pBaseAddress   - CPU pointer to the stream; converted with
//                    GPU_CONVERT_CPU_TO_GPU_ADDRESS and stored in DWORD units.
//   StreamMaxIndex - stored in the 23-bit StreamMaxIndex field.
//   ComponentSwap, NumericType, Format, EndianSwap - stored verbatim in the
//                    fields of the same name.
//
// The fixed-pattern fields (_01, _4B000000, _4B0, _96) are set to their
// required values.  Unnamed padding bits are not written.

__forceinline
void GPU_SET_MEMEXPORT_STREAM_CONSTANT(
    GPU_MEMEXPORT_STREAM_CONSTANT* pConstant,
    VOID* pBaseAddress,
    DWORD StreamMaxIndex,
    GPUSURFACESWAP ComponentSwap,
    GPUSURFACENUMBER NumericType,
    GPUCOLORFORMAT Format,
    GPUENDIAN128 EndianSwap)
{
    // float x: DWORD-granular address plus its tag bits.
    pConstant->BaseAddress = GPU_CONVERT_CPU_TO_GPU_ADDRESS(pBaseAddress) >> 2;
    pConstant->_01 = 0x01;

    // float y: fixed pattern.
    pConstant->_4B000000 = 0x4B000000;

    // float z: format description plus fixed pattern.
    pConstant->EndianSwap = EndianSwap;
    pConstant->Format = Format;
    pConstant->NumericType = NumericType;
    pConstant->ComponentSwap = ComponentSwap;
    pConstant->_4B0 = 0x4B0;

    // float w: index limit plus fixed pattern.
    pConstant->StreamMaxIndex = StreamMaxIndex;
    pConstant->_96 = 0x96;
}

#endif // _DEBUG

//------------------------------------------------------------------------------

// Four-DWORD layout of the indexed-draw command whose indices are fetched
// by DMA (SrcSelect must be GPUINDEXSELECT_DMA).  Fields are listed from the
// least-significant bit of each DWORD upward; dword[] is the raw view.
typedef union {
    struct {
        // DWORD 0:

        DWORD VizQueryId                    : 6;
        DWORD                               : 2;
        DWORD UseVizQuery                   : 1;
        DWORD                               : 23;

        // DWORD 1:

        DWORD PrimType                      : 6;    // GPUPRIMTYPE
        DWORD SrcSelect                     : 2;    // Must be GPUINDEXSELECT_DMA
        DWORD MajorMode                     : 3;
        DWORD IndexType                     : 1;    // GPUINDEXTYPE
        DWORD NotEndOfPacket                : 1;
        DWORD                               : 3;
        DWORD NumIndices                    : 16;

        // DWORD 2:

        DWORD IndexBase                     : 32;   // presumably the index buffer address - confirm units

        // DWORD 3:

        DWORD IndexSize                     : 24;
        DWORD                               : 6;
        DWORD Endian                        : 2;    // GPUENDIAN
    };
    DWORD dword[4];
} GPUCOMMAND_DRAW_INDEX;

// Two-DWORD layout of the auto-indexed draw command (SrcSelect must be
// GPUINDEXSELECT_AUTO).  Fields are listed from the least-significant bit of
// each DWORD upward; dword[] is the raw view.
//
// DWORD 0 uses the same viz-query layout as GPUCOMMAND_DRAW_INDEX and
// GPUCOMMAND_VIZ_QUERY: a 6-bit id, two reserved bits, then the flag at
// bit 8.
typedef union {
    struct {
        // DWORD 0:

        DWORD VizQueryId                    : 6;
        DWORD                               : 2;
        DWORD UseVizQuery                   : 1;
        DWORD                               : 23;

        // DWORD 1:

        DWORD PrimType                      : 6;    // GPUPRIMTYPE
        DWORD SrcSelect                     : 2;    // Must be GPUINDEXSELECT_AUTO
        DWORD MajorMode                     : 3;
        DWORD IndexSize                     : 1;    // Same bit as IndexType in the other draw commands
        DWORD NotEndOfPacket                : 1;
        DWORD                               : 3;
        DWORD NumIndices                    : 16;
    };
    DWORD dword[2];
} GPUCOMMAND_DRAW_AUTO;

// Layout of the draw command that carries its indices inline (SrcSelect
// must be GPUINDEXSELECT_IMMEDIATE).  One fixed DWORD is followed by a
// variable-length index list, exposed as zero-sized arrays of either 16-bit
// or 32-bit entries; which view applies presumably follows IndexType.
// sizeof() therefore covers only the fixed DWORD.
typedef union {
    struct {
        // DWORD 0:

        DWORD PrimType                      : 6;    // GPUPRIMTYPE
        DWORD SrcSelect                     : 2;    // Must be GPUINDEXSELECT_IMMEDIATE
        DWORD MajorMode                     : 3;
        DWORD IndexType                     : 1;    // GPUINDEXTYPE
        DWORD NotEndOfPacket                : 1;
        DWORD                               : 3;
        DWORD NumIndices                    : 16;

        // DWORD 1:

        union {
            WORD Index16[];
            DWORD Index32[];
        };
    };
    DWORD dword[1];
} GPUCOMMAND_DRAW_IMMEDIATE;

// Three-DWORD layout of the register read-modify-write command.  DWORD 1
// and DWORD 2 each have two overlapping views: a full 32-bit mask or a
// 13-bit register number.  The AndImmediate / OrImmediate flags in DWORD 0
// presumably select which view is in effect for the AND and OR operands
// respectively - confirm against the hardware documentation.
typedef union {
    struct {
        // DWORD 0:

        DWORD Register                      : 13;
        DWORD                               : 17;
        DWORD OrImmediate                   : 1;
        DWORD AndImmediate                  : 1;

        // DWORD 1:

        union {
            struct {
                DWORD AndMask               : 32;
            };
            struct {
                DWORD AndRegister           : 13;
                DWORD                       : 19;
            };
        };

        // DWORD 2:

        union {
            struct {
                DWORD OrMask                : 32;
            };
            struct {
                DWORD OrRegister            : 13;
                DWORD                       : 19;
            };
        };
    };
    DWORD dword[3];
} GPUCOMMAND_REG_RMW;

// Twelve-DWORD layout of the set-state command.  DWORDs 0-7 each describe
// one state group with the same {Size:4, Disable:1, Address:27} shape.  The
// vertex and pixel shader entries each take two DWORDs: {Disable, Address}
// followed by a separate 14-bit size.  Fields are listed from the
// least-significant bit of each DWORD upward.
//
// dword[] is the raw view and is sized to cover all twelve DWORDs of the
// structured view, so dword[10] and dword[11] (the pixel shader entry) are
// within the declared bounds.
typedef union {
    struct {
        // DWORD 0:

        DWORD DestinationSize               : 4;
        DWORD DestinationDisable            : 1;
        DWORD DestinationAddress            : 27;

        // DWORD 1:

        DWORD WindowSize                    : 4;
        DWORD WindowDisable                 : 1;
        DWORD WindowAddress                 : 27;

        // DWORD 2:

        DWORD ValuesSize                    : 4;
        DWORD ValuesDisable                 : 1;
        DWORD ValuesAddress                 : 27;

        // DWORD 3:

        DWORD ProgramSize                   : 4;
        DWORD ProgramDisable                : 1;
        DWORD ProgramAddress                : 27;

        // DWORD 4:

        DWORD ControlSize                   : 4;
        DWORD ControlDisable                : 1;
        DWORD ControlAddress                : 27;

        // DWORD 5:

        DWORD TessellatorSize               : 4;
        DWORD TessellatorDisable            : 1;
        DWORD TessellatorAddress            : 27;

        // DWORD 6:

        DWORD MiscSize                      : 4;
        DWORD MiscDisable                   : 1;
        DWORD MiscAddress                   : 27;

        // DWORD 7:

        DWORD PointSize                     : 4;
        DWORD PointDisable                  : 1;
        DWORD PointAddress                  : 27;

        // DWORD 8:

        DWORD                               : 4;
        DWORD VertexShaderDisable           : 1;
        DWORD VertexShaderAddress           : 27;

        // DWORD 9:

        DWORD VertexShaderSize              : 14;
        DWORD                               : 18;

        // DWORD 10:

        DWORD                               : 4;
        DWORD PixelShaderDisable            : 1;
        DWORD PixelShaderAddress            : 27;

        // DWORD 11:

        DWORD PixelShaderSize               : 14;
        DWORD                               : 18;
    };
    DWORD dword[12];
} GPUCOMMAND_SET_STATE;

// Layout of the set-constant command: one header DWORD selecting the
// constant bank (Id) and a starting Offset, followed by a variable-length
// run of data DWORDs (zero-sized array, so sizeof() covers the header only).
typedef union {
    struct {
        // DWORD 0:

        DWORD Offset                        : 11;
        DWORD                               : 5;
        DWORD Id                            : 8;    // GPUCONSTANTID
        DWORD                               : 8;

        // DWORD 1:

        DWORD Data[];
    };
    DWORD dword[1];
} GPUCOMMAND_SET_CONSTANT;

// Three-DWORD layout of the load-ALU-constant command.  Address occupies
// the upper 30 bits of DWORD 0 (the low two bits are reserved), which
// suggests a DWORD-aligned address - confirm.  dword[] is the raw view.
typedef union {
    struct {
        // DWORD 0:

        DWORD                               : 2;
        DWORD Address                       : 30;

        // DWORD 1:

        DWORD Offset                        : 11;
        DWORD                               : 21;

        // DWORD 2:

        DWORD Size                          : 12;
        DWORD                               : 20;
    };
    DWORD dword[3];
} GPUCOMMAND_LOAD_ALU_CONSTANT;

// Two-DWORD layout of the load-shader command.  Address occupies the upper
// 27 bits of DWORD 0.  Fields are listed from the least-significant bit of
// each DWORD upward; dword[] is the raw view.
typedef union {
    struct {
        // DWORD 0:

        DWORD Type                          : 2;    // GPULOADTYPE
        DWORD                               : 3;
        DWORD Address                       : 27;

        // DWORD 1:

        DWORD Size                          : 14;
        DWORD                               : 2;
        DWORD Start                         : 12;   // Unused unless GPULOADTYPE_SHARED
        DWORD                               : 4;
    };
    DWORD dword[2];
} GPUCOMMAND_LOAD_SHADER;

// Single-DWORD bitmask for the invalidate-state command: one flag per state
// group, occupying bits 0-14; the upper 17 bits are reserved.  The first
// ten flags carry the same names as the entries of GPUCOMMAND_SET_STATE.
typedef union {
    struct {
        DWORD Destination                   : 1;
        DWORD Window                        : 1;
        DWORD Values                        : 1;
        DWORD Program                       : 1;
        DWORD Control                       : 1;
        DWORD Tessellator                   : 1;
        DWORD Misc                          : 1;
        DWORD Point                         : 1;
        DWORD VertexShader                  : 1;
        DWORD PixelShader                   : 1;
        DWORD AluConstantBase               : 1;
        DWORD FetchConstantBase             : 1;
        DWORD IncrementalRegisterBase       : 1;
        DWORD BooleanBase                   : 1;
        DWORD IntegerBase                   : 1;
        DWORD                               : 17;
    };
    DWORD dword[1];
} GPUCOMMAND_INVALIDATE_STATE;

// Five-DWORD layout of the wait-on-register-or-memory command.  DWORD 1 has
// two overlapping views: an {Endian, Address} pair or a 15-bit register
// number; MemSpace (GPUSYNCSPACE) presumably selects between them - confirm.
// Function (GPUSYNCFUNCTION), Reference and Mask presumably define the
// comparison that ends the wait.
typedef union {
    struct {
        // DWORD 0:

        DWORD Function                      : 3;    // GPUSYNCFUNCTION
        DWORD                               : 1;
        DWORD MemSpace                      : 1;    // GPUSYNCSPACE
        DWORD                               : 27;

        // DWORD 1:

        union {
            struct {
                DWORD Endian                : 2;    // GPUENDIAN
                DWORD Address               : 30;
            };
            struct {
                DWORD Register              : 15;
                DWORD                       : 17;
            };
        };

        // DWORD 2:

        DWORD Reference                     : 32;

        // DWORD 3:

        DWORD Mask                          : 32;

        // DWORD 4:

        DWORD WaitInterval                  : 16;
        DWORD                               : 16;
    };
    DWORD dword[5];
} GPUCOMMAND_WAIT_REG_MEM;

// Four-DWORD layout of the wait-until-register-equals command (per the type
// name): a 15-bit register number, a reference value, a mask and a 16-bit
// wait interval.  The layout is identical to GPUCOMMAND_WAIT_REG_GTE.
typedef union {
    struct {
        // DWORD 0:

        DWORD Register                      : 15;
        DWORD                               : 17;

        // DWORD 1:

        DWORD Reference                     : 32;

        // DWORD 2:

        DWORD Mask                          : 32;

        // DWORD 3:

        DWORD WaitInterval                  : 16;
        DWORD                               : 16;
    };
    DWORD dword[4];
} GPUCOMMAND_WAIT_REG_EQ;

// Four-DWORD layout of the wait-until-register-is-greater-or-equal command
// (per the type name): a 15-bit register number, a reference value, a mask
// and a 16-bit wait interval.  The layout is identical to
// GPUCOMMAND_WAIT_REG_EQ.
typedef union {
    struct {
        // DWORD 0:

        DWORD Register                      : 15;
        DWORD                               : 17;

        // DWORD 1:

        DWORD Reference                     : 32;

        // DWORD 2:

        DWORD Mask                          : 32;

        // DWORD 3:

        DWORD WaitInterval                  : 16;
        DWORD                               : 16;
    };
    DWORD dword[4];
} GPUCOMMAND_WAIT_REG_GTE;

// Two-DWORD layout of the memory-write command: a destination word whose
// low two bits hold the endian mode and whose upper 30 bits hold the
// address, followed by the 32-bit value.
typedef union {
    struct {
        // DWORD 0:

        DWORD Endian                        : 2;    // GPUENDIAN
        DWORD Address                       : 30;

        // DWORD 1:

        DWORD Data                          : 32;
    };
    DWORD dword[2];
} GPUCOMMAND_MEM_WRITE;

// Six-DWORD layout of the conditional-write command.  The poll location
// (DWORD 1) and the write location (DWORD 4) each have two overlapping
// views: an {Endian, Address} pair or a 15-bit register number.  PollSpace
// and WriteSpace (both GPUSYNCSPACE) presumably choose the view for each -
// confirm.  Function, Reference and Mask presumably define the test applied
// to the polled value before Data is written.
typedef union {
    struct {
        // DWORD 0:

        DWORD Function                      : 3;    // GPUSYNCFUNCTION
        DWORD                               : 1;
        DWORD PollSpace                     : 1;    // GPUSYNCSPACE
        DWORD                               : 3;
        DWORD WriteSpace                    : 1;    // GPUSYNCSPACE
        DWORD                               : 23;

        // DWORD 1:

        union {
            struct {
                DWORD PollEndian            : 2;    // GPUENDIAN
                DWORD PollAddress           : 30;
            };
            struct {
                DWORD PollRegister          : 15;
                DWORD                       : 17;
            };
        };

        // DWORD 2:

        DWORD Reference                     : 32;

        // DWORD 3:

        DWORD Mask                          : 32;

        // DWORD 4:

        union {
            struct {
                DWORD WriteEndian           : 2;    // GPUENDIAN
                DWORD WriteAddress          : 30;
            };
            struct {
                DWORD WriteRegister         : 15;
                DWORD                       : 17;
            };
        };

        // DWORD 5:

        DWORD Data                          : 32;
    };
    DWORD dword[6];
} GPUCOMMAND_COND_WRITE;

// Single-DWORD layout of the memory-write-counter command: the endian mode
// in the low two bits and the destination address in the upper 30 bits.
// The command carries no data DWORD.
typedef union {
    struct {
        // DWORD 0:

        DWORD Endian                        : 2;    // GPUENDIAN
        DWORD Address                       : 30;
    };
    DWORD dword[1];
} GPUCOMMAND_MEM_WRITE_COUNTER;

// Three-DWORD layout of the event-write command: the initiating event and
// two option flags (bits 30 and 31), a destination {Endian, Address} word,
// and a 32-bit data value.  The layout is identical to
// GPUCOMMAND_EVENT_WRITE_SHADER.
typedef union {
    struct {
        // DWORD 0:

        DWORD Initiator                     : 6;    // GPUINITIATOR
        DWORD                               : 24;
        DWORD SoftwareManaged               : 1;
        DWORD UseCounter                    : 1;

        // DWORD 1:

        DWORD Endian                        : 2;    // GPUENDIAN
        DWORD Address                       : 30;

        // DWORD 2:

        DWORD Data                          : 32;
    };
    DWORD dword[3];
} GPUCOMMAND_EVENT_WRITE;

// Three-DWORD layout of the shader event-write command: the initiating
// event and two option flags (bits 30 and 31), a destination
// {Endian, Address} word, and a 32-bit data value.  The layout is identical
// to GPUCOMMAND_EVENT_WRITE.
typedef union {
    struct {
        // DWORD 0:

        DWORD Initiator                     : 6;    // GPUINITIATOR
        DWORD                               : 24;
        DWORD SoftwareManaged               : 1;
        DWORD UseCounter                    : 1;

        // DWORD 1:

        DWORD Endian                        : 2;    // GPUENDIAN
        DWORD Address                       : 30;

        // DWORD 2:

        DWORD Data                          : 32;
    };
    DWORD dword[3];
} GPUCOMMAND_EVENT_WRITE_SHADER;

// Three-DWORD layout of the cache-flush event-write command.  It matches
// GPUCOMMAND_EVENT_WRITE except that there is no SoftwareManaged flag: bit
// 30 of DWORD 0 is reserved and only UseCounter (bit 31) remains.
typedef union {
    struct {
        // DWORD 0:

        DWORD Initiator                     : 6;    // GPUINITIATOR
        DWORD                               : 25;
        DWORD UseCounter                    : 1;

        // DWORD 1:

        DWORD Endian                        : 2;    // GPUENDIAN
        DWORD Address                       : 30;

        // DWORD 2:

        DWORD Data                          : 32;
    };
    DWORD dword[3];
} GPUCOMMAND_EVENT_WRITE_CACHE_FLUSH;

// Two-DWORD layout of the screen-extent event-write command: the initiating
// event followed by a destination {Endian, Address} word.  The command
// carries no data DWORD and no option flags.
typedef union {
    struct {
        // DWORD 0:

        DWORD Initiator                     : 6;    // GPUINITIATOR
        DWORD                               : 26;

        // DWORD 1:

        DWORD Endian                        : 2;    // GPUENDIAN
        DWORD Address                       : 30;
    };
    DWORD dword[2];
} GPUCOMMAND_EVENT_WRITE_SCREEN_EXTENT;

// Single-DWORD layout of the z-pass-done event-write command: only the
// 6-bit initiating event; the upper 26 bits are reserved.
typedef union {
    struct {
        DWORD Initiator                     : 6;    // GPUINITIATOR
        DWORD                               : 26;
    };
    DWORD dword[1];
} GPUCOMMAND_EVENT_WRITE_ZPASS_DONE;

// Two-DWORD layout of the register-to-memory command: a 15-bit register
// number followed by a destination {Endian, Address} word.
typedef union {
    struct {
        // DWORD 0:

        DWORD Register                      : 15;
        DWORD                               : 17;

        // DWORD 1:

        DWORD Endian                        : 2;    // GPUENDIAN
        DWORD Address                       : 30;
    };
    DWORD dword[2];
} GPUCOMMAND_REG_TO_MEM;

// Single-DWORD layout of the wait-for-idle command.  The DWORD carries no
// defined fields; it is named Unused.
typedef union {
    struct {
        DWORD Unused                        : 32;
    };
    DWORD dword[1];
} GPUCOMMAND_WAIT_FOR_IDLE;

// Single-DWORD bitmask for the CPU-interrupt command: one flag per target,
// Cpu0 at bit 0 through Cpu5 at bit 5; the upper 26 bits are reserved.
typedef union {
    struct {
        DWORD Cpu0                          : 1;
        DWORD Cpu1                          : 1;
        DWORD Cpu2                          : 1;
        DWORD Cpu3                          : 1;
        DWORD Cpu4                          : 1;
        DWORD Cpu5                          : 1;
        DWORD                               : 26;
    };
    DWORD dword[1];
} GPUCOMMAND_CPU_INTERRUPT;

// Single-DWORD layout of the viz-query command: a 6-bit query id in bits
// 0-5, two reserved bits, and the End flag at bit 8.  The id and flag
// positions match DWORD 0 of GPUCOMMAND_DRAW_INDEX.
typedef union {
    struct {
        DWORD Id                            : 6;
        DWORD                               : 2;
        DWORD End                           : 1;
        DWORD                               : 23;
    };
    DWORD dword[1];
} GPUCOMMAND_VIZ_QUERY;

// Layout of the MPEG index command.  DWORD 0 has the same field layout as
// DWORD 0 of GPUCOMMAND_DRAW_IMMEDIATE, with PrimType constrained to
// GPUPRIMTYPE_RECTLIST.  DWORD 1 holds a 14-bit base-index count, and a
// variable-length list of 32-bit indices follows (zero-sized array, so
// sizeof() covers only the two fixed DWORDs).
typedef union {
    struct {
        // DWORD 0:

        DWORD PrimType                      : 6;    // Must be GPUPRIMTYPE_RECTLIST
        DWORD SrcSelect                     : 2;    // Must be GPUINDEXSELECT_IMMEDIATE
        DWORD MajorMode                     : 3;
        DWORD IndexType                     : 1;    // GPUINDEXTYPE
        DWORD NotEndOfPacket                : 1;
        DWORD                               : 3;
        DWORD NumIndices                    : 16;

        // DWORD 1:

        DWORD NumBaseIndices                : 14;
        DWORD                               : 18;

        // DWORD 2:

        DWORD Index32[];
    };
    DWORD dword[2];
} GPUCOMMAND_MPEG_INDEX;

// Single-DWORD layout of the no-op command.  The DWORD carries no defined
// fields; it is named Unused.
typedef union {
    struct {
        DWORD Unused                        : 32;
    };
    DWORD dword[1];
} GPUCOMMAND_NOP;

// Two-DWORD layout of the indirect-buffer command: a full 32-bit address,
// then a 20-bit size with the MultiPass flag at bit 31.  The units of Size
// are not stated here - confirm.
typedef union {
    struct {
        // DWORD 0:

        DWORD Address                       : 32;

        // DWORD 1:

        DWORD Size                          : 20;
        DWORD                               : 11;
        DWORD MultiPass                     : 1;
    };
    DWORD dword[2];
} GPUCOMMAND_INDIRECT_BUFFER;

// Two-DWORD layout of the FIX_2_FLT_REG command: a 15-bit register number
// and an 8-bit value.  The type name suggests a fixed-point-to-float
// register write - confirm against the hardware documentation.
typedef union {
    struct {
        // DWORD 0:

        DWORD Register                      : 15;
        DWORD                               : 17;

        // DWORD 1:

        DWORD Value                         : 8;
        DWORD                               : 24;
    };
    DWORD dword[2];
} GPUCOMMAND_FIX_2_FLT_REG;

// Single-DWORD layout of the context-update command.  The DWORD carries no
// defined fields; it is named Unused.
typedef union {
    struct {
        DWORD Unused                        : 32;
    };
    DWORD dword[1];
} GPUCOMMAND_CONTEXT_UPDATE;

#ifndef XAM_BUILD

// Single-DWORD layout of the set-bin-mask (low half) command.  Low is
// presumably the lower 32 bits of a 64-bit bin mask whose upper half is
// carried by GPUCOMMAND_SET_BIN_MASK_HI - confirm.
typedef union {
    struct {
        // DWORD 0:

        DWORD Low                           : 32;
    };
    DWORD dword[1];
} GPUCOMMAND_SET_BIN_MASK_LO;

// Single-DWORD layout of the set-bin-mask (high half) command.  High is
// presumably the upper 32 bits of a 64-bit bin mask whose lower half is
// carried by GPUCOMMAND_SET_BIN_MASK_LO - confirm.
typedef union {
    struct {
        // DWORD 0:

        DWORD High                          : 32;
    };
    DWORD dword[1];
} GPUCOMMAND_SET_BIN_MASK_HI;

// Single-DWORD layout of the set-bin-select (low half) command.  Low is
// presumably the lower 32 bits of a 64-bit bin select value whose upper
// half is carried by GPUCOMMAND_SET_BIN_SELECT_HI - confirm.
typedef union {
    struct {
        // DWORD 0:

        DWORD Low                           : 32;
    };
    DWORD dword[1];
} GPUCOMMAND_SET_BIN_SELECT_LO;

// Single-DWORD layout of the set-bin-select (high half) command.  High is
// presumably the upper 32 bits of a 64-bit bin select value whose lower
// half is carried by GPUCOMMAND_SET_BIN_SELECT_LO - confirm.
typedef union {
    struct {
        // DWORD 0:

        DWORD High                          : 32;
    };
    DWORD dword[1];
} GPUCOMMAND_SET_BIN_SELECT_HI;

#endif

//------------------------------------------------------------------------------
// GPU performance counters

//------------------------------------------------------------------------------
// GPU performance counter event select enums

// Performance counter event selectors with the GPUPE_CP_ / ME_ / PFP_ / CSF_
// prefixes (presumably the command processor block).  Values are contiguous
// from 0 through 63; entries marked "Unused" are placeholders that keep the
// numbering dense.
typedef enum
{
    GPUPE_CP_COUNT = 0, // Always Count
    GPUPE_CP_RBIU_STALL = 1, // RBIU Transaction FIFO Full
    GPUPE_CP_RBIU_TAF = 2, // RBIU Transaction Almost FIFO
    GPUPE_CP_PFP_STALL = 3, // PFP Transaction is Waiting for RBBM in RCIU
    GPUPE_CP_RESERVED0 = 4, // Unused
    GPUPE_CP_RESERVED1 = 5, // Unused
    GPUPE_CP_NRT_RCIU_STALL = 6, // Transaction is Waiting for RBBM in RCIU
    GPUPE_CP_Reserved2 = 7, // Unused (note: mixed-case name, unlike the other RESERVED entries)
    GPUPE_CP_NRT_MIU_STALL = 8, // CSF Fetcher Waiting on MIU
    GPUPE_CP_CSF_PFP_I1_FULL = 9, // CSF PFP I1 Request FIFO is Full
    GPUPE_CP_CSF_PFP_I2_FULL = 10, // CSF PFP I2 Request FIFO is Full
    GPUPE_CP_CSF_PFP_REQ_FULL = 11, // CSF PFP State Request FIFO is Full
    GPUPE_CP_RESERVED3 = 12, // Unused
    GPUPE_CP_RING_ROQ_FULL = 13, // Ring Reorder Queue is Full
    GPUPE_CP_I1_ROQ_FULL = 14, // I1 Reorder Queue is Full
    GPUPE_CP_I2_ROQ_FULL = 15, // I2 Reorder Queue is Full
    GPUPE_CP_ST_ROQ_FULL = 16, // State Reorder Queue is Full
    GPUPE_CP_RTST_ROQ_FULL = 17, // Vertex Shader Early Fetch Done. NOTE(review): description does not match the name, which suggests a reorder-queue-full event like its neighbors - confirm
    GPUPE_CP_MIU_TAG_MEM_FULL = 18, // MIU Tag Memory is Full
    GPUPE_CP_MIU_WRITECLEAN = 19, // MIU WriteClean is In-Progress
    GPUPE_CP_RESERVED4 = 20, // Unused
    GPUPE_CP_RESERVED5 = 21, // Unused
    GPUPE_CP_NRT_WRITE_STALL = 22, // Write Request Stalled by MIU Input FIFO
    GPUPE_CP_NRT_READ_STALL = 23, // Read Request Stalled by MIU Input FIFO
    GPUPE_CP_WC_FIFO_FULL = 24, // Write Confirm FIFO is Full
    GPUPE_CP_VTX_DEALLOC_FIFO_FULL = 25, // Vertex Shader Dealloc FIFO is Full
    GPUPE_CP_PIX_DEALLOC_FIFO_FULL = 26, // Pixel Shader Dealloc FIFO is Full
    GPUPE_CP_VTX_EVENT_FIFO_FULL = 27, // Vertex Shader Event FIFO is Full
    GPUPE_CP_PIX_EVENT_FIFO_FULL = 28, // Pixel Shader Event FIFO is Full
    GPUPE_CP_CF_EVENT_FIFO_FULL = 29, // Cache Flush Event FIFO is Full
    GPUPE_CP_ME_RB_STARVED = 30, // Micro Engine's RB Processing Starved by PFP
    GPUPE_CP_ME_I1_STARVED = 31, // Micro Engine's I1 Processing Starved by PFP
    GPUPE_CP_ME_I2_STARVED = 32, // Micro Engine's I2 Processing Starved by PFP
    GPUPE_CP_ME_ST_STARVED = 33, // Micro Engine's ST Processing Starved by PFP
    GPUPE_CP_RESERVED6 = 34, // Unused
    GPUPE_CP_RESERVED7 = 35, // Unused
    GPUPE_CP_RESERVED8 = 36, // Unused
    GPUPE_CP_RESERVED9 = 37, // Unused
    GPUPE_CP_RESERVED10 = 38, // Unused
    GPUPE_CP_RESERVED11 = 39, // Unused
    GPUPE_RCIU_RBBM_DWORD_SENT = 40, // RCIU is sending data to the RBBM
    GPUPE_ME_PARSER_BUSY_CLOCKS = 41, // Micro Engine's Parser is Busy
    GPUPE_ME_WAIT_CONTEXT_AVAIL = 42, // Micro Engine is waiting for an available context
    GPUPE_PFP_TYPE0_PACKET = 43, // PFP processed a Type-0 packet
    GPUPE_PFP_TYPE3_PACKET = 44, // PFP processed a Type-3 packet
    GPUPE_CSF_RB_WPTR_NEQ_RPTR = 45, // The CSF has more data to fetch from the Ring Command buffer
    GPUPE_CSF_I1_SIZE_NEQ_ZERO = 46, // The CSF has more data to fetch from the Indirect1 Command buffer
    GPUPE_CSF_I2_SIZE_NEQ_ZERO = 47, // The CSF has more data to fetch from the Indirect2 Command buffer
    GPUPE_CSF_RB_I1_I2_FETCHING = 48, // The CSF has more data to fetch from any of the Command buffers (Ring/Indirect1/Indirect2)
    GPUPE_CP_RESERVED12 = 49, // Unused
    GPUPE_CP_RESERVED13 = 50, // Unused
    GPUPE_CP_RESERVED14 = 51, // Unused
    GPUPE_CP_RESERVED15 = 52, // Unused
    GPUPE_CP_RESERVED16 = 53, // Unused
    GPUPE_CP_RESERVED17 = 54, // Unused
    GPUPE_CP_RESERVED18 = 55, // Unused
    GPUPE_CP_RESERVED19 = 56, // Unused
    GPUPE_CP_RESERVED20 = 57, // Unused
    GPUPE_CP_RESERVED21 = 58, // Unused
    GPUPE_CP_RESERVED22 = 59, // Unused
    GPUPE_CP_RESERVED23 = 60, // Unused
    GPUPE_CP_RESERVED24 = 61, // Unused
    GPUPE_CP_RESERVED25 = 62, // Unused
    GPUPE_CP_RESERVED26 = 63, // Unused
} GPUPERFEVENT_CP;

// Performance counter event selectors with the GPUPE_RBBM_ prefix.  Values
// are contiguous from 0 through 27.  The GPUPE_RESERVEDn entries are
// placeholders that keep the numbering dense; note that they carry no RBBM
// prefix, so they occupy the shared GPUPE_ namespace.  Most *_BUSY entries
// have no description in this header.
typedef enum
{
    GPUPE_RBBM_COUNT = 0, // Count Number of Clocks
    GPUPE_RBBM_NRT_BUSY = 1, // Non-Real-Time Busy
    GPUPE_RBBM_BC_CNTX0_BUSY = 2,
    GPUPE_RBBM_BC_CNTX17_BUSY = 3,
    GPUPE_RBBM_SQ_CNTX0_BUSY = 4,
    GPUPE_RBBM_SQ_CNTX17_BUSY = 5,
    GPUPE_RBBM_VGT_BUSY = 6,
    GPUPE_RBBM_VGT_NODMA_BUSY = 7,
    GPUPE_RBBM_PA_BUSY = 8,
    GPUPE_RBBM_SC_CNTX0_BUSY = 9,
    GPUPE_RBBM_SC_CNTX17_BUSY = 10,
    GPUPE_RBBM_TPC_BUSY = 11,
    GPUPE_RBBM_TC_BUSY = 12,
    GPUPE_RBBM_SX_BUSY = 13,
    GPUPE_RESERVED1 = 14,
    GPUPE_RBBM_CP_COHER_BUSY = 15,
    GPUPE_RBBM_CP_NRT_BUSY = 16,
    GPUPE_RESERVED2 = 17,
    GPUPE_RBBM_CP_DMA_BUSY = 18,
    GPUPE_RESERVED3 = 19,
    GPUPE_RESERVED4 = 20,
    GPUPE_RESERVED5 = 21,
    GPUPE_RBBM_DMA_IDLE_STALL = 22, // Non-RT Waiting for CP's DMA to go Idle
    GPUPE_RESERVED6 = 23,
    GPUPE_RESERVED7 = 24,
    GPUPE_RBBM_GFX_IDLE_STALL = 25, // Non-RT Waiting for Graphics Pipe to be Idle
    GPUPE_RBBM_GFX_IDLEC_STALL = 26, // Non-RT Waiting for Graphics Pipe to be Idle and Clean
    GPUPE_RBBM_INTERRUPT = 27, // Combined Interrupt Signal to the BIF
} GPUPERFEVENT_RBBM;

typedef enum
{
    GPUPE_SQ_PIXEL_VECTORS_SUB = 0, // Number of pixel vectors submitted
    GPUPE_SQ_VERTEX_VECTORS_SUB = 1, // Number of vertex vectors submitted
    GPUPE_SQ_ALU0_ACTIVE_VTX_SIMD0 = 2, // Number of cycles SIMD0's ALU0 is executing vertex shader instructions.
    GPUPE_SQ_ALU1_ACTIVE_VTX_SIMD0 = 3, // Number of cycles ALU 1 is active (vertex) for SIMD0
    GPUPE_SQ_ALU0_ACTIVE_PIX_SIMD0 = 4, // Number of cycles ALU 0 is active (pixel) for SIMD0
    GPUPE_SQ_ALU1_ACTIVE_PIX_SIMD0 = 5, // Number of cycles ALU 1 is active (pixel) for SIMD0
    GPUPE_SQ_ALU0_ACTIVE_VTX_SIMD1 = 6, // Number of cycles ALU 0 is active (vertex) for SIMD1
    GPUPE_SQ_ALU1_ACTIVE_VTX_SIMD1 = 7, // Number of cycles ALU 1 is active (vertex) for SIMD1
    GPUPE_SQ_ALU0_ACTIVE_PIX_SIMD1 = 8, // Number of cycles ALU 0 is active (pixel) for SIMD1
    GPUPE_SQ_ALU1_ACTIVE_PIX_SIMD1 = 9, // Number of cycles ALU 1 is active (pixel) for SIMD1
    GPUPE_SQ_EXPORT_CYCLES = 10, // Number of clocks the SQ is exporting data
    GPUPE_SQ_ALU_CST_WRITTEN = 11, // Number of ALU constants written from the CP for both pix/vtx
    GPUPE_SQ_TEX_CST_WRITTEN = 12, // Number of texture constants written from the CP for both pix/vtx
    GPUPE_SQ_ALU_CST_STALL = 13, // Number of clocks the constant memory is stalled because of ALU constant store full
    GPUPE_SQ_ALU_TEX_STALL = 14, // Number of clocks the constant memory is stalled because of texture constant store full
    GPUPE_SQ_INST_WRITTEN = 15, // Number of instructions written from the CP for both pix/vtx
    GPUPE_SQ_BOOLEAN_WRITTEN = 16, // Number of control flow booleans written from the CP for both pix/vtx
    GPUPE_SQ_LOOPS_WRITTEN = 17, // Number of control flow loops written from the CP for both pix/vtx
    GPUPE_SQ_PIXEL_SWAP_IN = 18, // Number of times a pixel vector is de-activated, this should be (number of clauses) * number of threads
    GPUPE_SQ_PIXEL_SWAP_OUT = 19, // Number of times a pixel vector is activated, this should be (number of clauses - 1) * number of threads
    GPUPE_SQ_VERTEX_SWAP_IN = 20, // Number of times a vertex vector is de-activated, this should be (number of clauses) * number of threads
    GPUPE_SQ_VERTEX_SWAP_OUT = 21, // Number of times a vertex vector is activated, this should be (number of clauses - 1) * number of threads
    GPUPE_SQ_ALU_VTX_INST_ISSUED = 22, // Number of ALU instruction issued (vertex) include all SIMDS and ALU 0/1
    GPUPE_SQ_TEX_VTX_INST_ISSUED = 23, // Number of Texture instruction issued (vertex)
    GPUPE_SQ_VC_VTX_INST_ISSUED = 24, // Number of VC instruction issued (vertex)
    GPUPE_SQ_CF_VTX_INST_ISSUED = 25, // Number of control flow instruction issued (vertex) include all resources (TP,VC,all SIMDS)
    GPUPE_SQ_ALU_PIX_INST_ISSUED = 26, // Number of ALU instruction issued (pixel) include all SIMDS and ALU 0/1
    GPUPE_SQ_TEX_PIX_INST_ISSUED = 27, // Number of Texture instruction issued (pixel)
    GPUPE_SQ_VC_PIX_INST_ISSUED = 28, // Number of VC instruction issued (pixel)
    GPUPE_SQ_CF_PIX_INST_ISSUED = 29, // Number of control flow instruction issued (pixel) include all resources (TP,VC,all SIMDS)
    GPUPE_SQ_ALU0_FIFO_EMPTY_SIMD0 = 30, // aka 'SQ_ALU0_STALL_SIMD0'. Number of clocks SIMD0's ALU0 and ALU1 were both idle, when there was any pixel or vertex threads in the RS.
    GPUPE_SQ_ALU1_FIFO_EMPTY_SIMD0 = 31, // Number of clocks ALU 1 FIFO was empty (busy with control flow) for SIMD0, only counts when there is a thread in any of pixel or vertex RS
    GPUPE_SQ_ALU0_FIFO_EMPTY_SIMD1 = 32, // Number of clocks ALU 0 FIFO was empty (busy with control flow) for SIMD1, only counts when there is a thread in any of pixel or vertex RS
    GPUPE_SQ_ALU1_FIFO_EMPTY_SIMD1 = 33, // Number of clocks ALU 1 FIFO was empty (busy with control flow) for SIMD1, only counts when there is a thread in any of pixel or vertex RS
    GPUPE_SQ_ALU_NOPS = 34, // Number of ALU NOPs generated by the SQ. This counts the number of added ALU instructions because of the use of the address register immediately after the address is set
    GPUPE_SQ_PRED_SKIP = 35, // Always zero
    GPUPE_SQ_SYNC_ALU_STALL_SIMD0_VTX = 36, // Number of cycles all vertex threads are blocked because of synchronization (Alu SIMD0). This is all threads that want to go to Alu SIMD0 but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_ALU_STALL_SIMD1_VTX = 37, // Number of cycles all vertex threads are blocked because of synchronization (Alu SIMD1) This is all threads that want to go to Alu SIMD1 but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_TEX_STALL_VTX = 38, // Number of cycles all vertex threads are blocked because of synchronization (Texture) This is all threads that want to go to the TP but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_VC_STALL_VTX = 39, // Number of cycles all vertex threads are blocked because of synchronization (VC) This is all threads that want to go to the VC but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_CONSTANTS_USED_SIMD0 = 40, // Number of ALU constants used for SIMD0. Decodes the instruction to count how many constants there are (1,2,3)
    GPUPE_SQ_CONSTANTS_SENT_SP_SIMD0 = 41, // Number of ALU constants sent to the SP for SIMD0. This is the number of reads to the constant store. If greather than SQ_CONSTANTS_USED, an instruction was recirculated because of waterfaling.
    GPUPE_SQ_GPR_STALL_VTX = 42, // Number of stall cycles because of GPR resource (vertex). This prevents vertexes to be sent from the VGT.
    GPUPE_SQ_GPR_STALL_PIX = 43, // Number of stall cycles because of GPR resource (pixel). This prevents pixels to be sent from the SC.
    GPUPE_SQ_VTX_RS_STALL = 44, // Number of stall cycles because of Reservation Station (vertex). This prevents vertexes to be sent from the VGT.
    GPUPE_SQ_PIX_RS_STALL = 45, // Number of stall cycles because of Reservation Station (pixel). This prevents pixels to be sent from the SC.
    GPUPE_SQ_SX_PC_FULL  = 46, // Number of cycles with parameter cache preventing export of vertex vector
    GPUPE_SQ_SX_EXP_BUFF_FULL = 47, // Number of cycles with export buffers preventing export
    GPUPE_SQ_SX_POS_BUFF_FULL = 48, // Number of cycles with position buffers preventing export
    GPUPE_SQ_INTERP_QUADS = 49, // Number of interpolated quads. This is number of quads * number of parameters.
    GPUPE_SQ_INTERP_ACTIVE = 50, // Number of active interpolation cycles. Optimaly one cycle should interpolate 4 quads for 1 parameter.
    GPUPE_SQ_IN_PIXEL_STALL = 51, // Number of cycles a ready to go pixel vector stalled because of port arbitration to the GPRs
    GPUPE_SQ_IN_VTX_STALL = 52, // Number of cycles a ready to go vertex vector stalled because of port arbitration to the GPRs
    GPUPE_SQ_VTX_CNT = 53, // Number of individual vertexes sent to the sequencer
    GPUPE_SQ_VTX_16_VECTOR = 54, // Number of vertex vectors with less than 16 vertexes
    GPUPE_SQ_VTX_32_VECTOR = 55, // Number of vertex vectors with less than 32 vertexes
    GPUPE_SQ_VTX_48_VECTOR = 56, // Number of vertex vectors with less than 48 vertexes
    GPUPE_SQ_PIXEL_16_VECTOR = 57, // Number of pixel vectors with less than 16 pixels
    GPUPE_SQ_PIXEL_32_VECTOR = 58, // Number of pixel vectors with less than 32 pixels
    GPUPE_SQ_PIXEL_48_VECTOR = 59, // Number of pixel vectors with less than 48 pixels
    GPUPE_SQ_CONSTANTS_USED_SIMD1 = 60, // Number of ALU constants used for SIMD1. Decodes the instruction to count how many constants there are (1,2,3)
    GPUPE_SQ_CONSTANTS_SENT_SP_SIMD1 = 61, // Number of ALU constants sent to the SP for SIMD1. This is the number of reads to the constant store. If greather than SQ_CONSTANTS_USED, an instruction was recirculated because of waterfaling.
    GPUPE_SQ_SX_MEM_EXP_FULL = 62, // always zero.
    GPUPE_SQ_ALU0_ACTIVE_VTX_SIMD2 = 63, // Number of cycles ALU 0 is active (vertex) for SIMD2
    GPUPE_SQ_ALU1_ACTIVE_VTX_SIMD2 = 64, // Number of cycles ALU 1 is active (vertex) for SIMD2
    GPUPE_SQ_ALU0_ACTIVE_PIX_SIMD2 = 65, // Number of cycles ALU 0 is active (pixel) for SIMD2
    GPUPE_SQ_ALU1_ACTIVE_PIX_SIMD2 = 66, // Number of cycles ALU 1 is active (pixel) for SIMD2
    GPUPE_SQ_ALU0_ACTIVE_VTX_SIMD3 = 67, // Number of cycles ALU 0 is active (vertex) for SIMD3
    GPUPE_SQ_ALU1_ACTIVE_VTX_SIMD3 = 68, // Number of cycles ALU 1 is active (vertex) for SIMD3
    GPUPE_SQ_ALU0_ACTIVE_PIX_SIMD3 = 69, // Number of cycles ALU 0 is active (pixel) for SIMD3
    GPUPE_SQ_ALU1_ACTIVE_PIX_SIMD3 = 70, // Number of cycles ALU 1 is active (pixel) for SIMD3
    GPUPE_SQ_ALU0_FIFO_EMPTY_SIMD2 = 71, // Number of clocks ALU 0 FIFO was empty (busy with control flow) for SIMD2, only counts when there is a thread in any of pixel or vertex RS
    GPUPE_SQ_ALU1_FIFO_EMPTY_SIMD2 = 72, // Number of clocks ALU 1 FIFO was empty (busy with control flow) for SIMD2, only counts when there is a thread in any of pixel or vertex RS
    GPUPE_SQ_ALU0_FIFO_EMPTY_SIMD3 = 73, // Number of clocks ALU 0 FIFO was empty (busy with control flow) for SIMD3, only counts when there is a thread in any of pixel or vertex RS
    GPUPE_SQ_ALU1_FIFO_EMPTY_SIMD3 = 74, // Number of clocks ALU 1 FIFO was empty (busy with control flow) for SIMD3, only counts when there is a thread in any of pixel or vertex RS
    GPUPE_SQ_SYNC_ALU_STALL_SIMD2_VTX = 75, // Number of cycles all vertex threads are blocked because of synchronization (Alu SIMD2) This is all threads that want to go to Alu SIMD2 but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_ALU_STALL_SIMD3_VTX = 76, // Number of cycles all vertex threads are blocked because of synchronization (Alu SIMD3) This is all threads that want to go to Alu SIMD3 but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_ALU_STALL_SIMD0_PIX = 77, // Number of cycles all pixel threads are blocked because of synchronization (Alu SIMD0) This is all threads that want to go to Alu SIMD0 but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_ALU_STALL_SIMD1_PIX = 78, // Number of cycles all pixel threads are blocked because of synchronization (Alu SIMD1) This is all threads that want to go to Alu SIMD1 but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_ALU_STALL_SIMD2_PIX = 79, // Number of cycles all pixel threads are blocked because of synchronization (Alu SIMD2) This is all threads that want to go to Alu SIMD2 but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_ALU_STALL_SIMD3_PIX = 80, // Number of cycles all pixel threads are blocked because of synchronization (Alu SIMD3) This is all threads that want to go to Alu SIMD3 but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_TEX_STALL_PIX = 81, // Number of cycles all pixel threads are blocked because of synchronization (Texture) This is all threads that want to go to the TP but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_SYNC_VC_STALL_PIX = 82, // Number of cycles all pixel threads are blocked because of synchronization (VC) This is all threads that want to go to the VC but cannot because of the VC,TP or alloc resource.
    GPUPE_SQ_CONSTANTS_USED_SIMD2 = 83, // Number of ALU constants used for SIMD2. Decodes the instruction to count how many constants there are (1,2,3)
    GPUPE_SQ_CONSTANTS_SENT_SP_SIMD2 = 84, // Number of ALU constants sent to the SP for SIMD2. This is the number of reads to the constant store. If greather than SQ_CONSTANTS_USED, an instruction was recirculated because of waterfaling.
    GPUPE_SQ_CONSTANTS_USED_SIMD3 = 85, // Number of ALU constants used for SIMD3. Decodes the instruction to count how many constants there are (1,2,3)
    GPUPE_SQ_CONSTANTS_SENT_SP_SIMD3 = 86, // Number of ALU constants sent to the SP for SIMD3. This is the number of reads to the constant store. If greather than SQ_CONSTANTS_USED, an instruction was recirculated because of waterfaling.
    GPUPE_SQ_ALU0_FIFO_FULL_SIMD0 = 87, // Number of cycles ALU 0 FIFO was full for SIMD0
    GPUPE_SQ_ALU1_FIFO_FULL_SIMD0 = 88, // Number of cycles ALU 1 FIFO was full for SIMD0
    GPUPE_SQ_ALU0_FIFO_FULL_SIMD1 = 89, // Number of cycles ALU 0 FIFO was full for SIMD1
    GPUPE_SQ_ALU1_FIFO_FULL_SIMD1 = 90, // Number of cycles ALU 1 FIFO was full for SIMD1
    GPUPE_SQ_ALU0_FIFO_FULL_SIMD2 = 91, // Number of cycles ALU 0 FIFO was full for SIMD2
    GPUPE_SQ_ALU1_FIFO_FULL_SIMD2 = 92, // Number of cycles ALU 1 FIFO was full for SIMD2
    GPUPE_SQ_ALU0_FIFO_FULL_SIMD3 = 93, // Number of cycles ALU 0 FIFO was full for SIMD3
    GPUPE_SQ_ALU1_FIFO_FULL_SIMD3 = 94, // Number of cycles ALU 1 FIFO was full for SIMD3
    GPUPE_VC_PERF_STATIC = 95, // Number of cycles the VC is not ready to receive anything and the SQ is not ready to send anything
    GPUPE_VC_PERF_STALLED = 96, // Number of cycles the VC is not ready to receive anything but the SQ is ready to send something
    GPUPE_VC_PERF_STARVED = 97, // Number of cycles the VC is ready to receive something but the SQ is not ready to send anything
    GPUPE_VC_PERF_SEND = 98, // Number of cycles the VC is ready to receive something and the SQ is is ready to send something
    GPUPE_VC_PERF_ACTUAL_STARVED = 99, // Number of cycles the VC is starved
    GPUPE_PIXEL_THREAD_0_ACTIVE  = 100, // Number of cycles the thread 0 of pixel is active
    GPUPE_VERTEX_THREAD_0_ACTIVE     = 101, // Number of cycles the thread 0 of vertex is active
    GPUPE_PIXEL_THREAD_0_NUMBER  = 102, // Number of times the thread 0 of pixel is active
    GPUPE_VERTEX_THREAD_0_NUMBER     = 103, // Number of times the thread 0 of vertex is active
    GPUPE_VERTEX_EVENT_NUMBER    = 104, // Number of events sent in the vertex thread buffer
    GPUPE_PIXEL_EVENT_NUMBER     = 105, // Number of events sent in the pixel thread buffer
} GPUPERFEVENT_SQ;

// Event identifiers collected under GPUPERFEVENT_VGT.
//
// Values are explicit and contiguous from 0 through 48; entries 29-39 are
// named SPARE placeholders.  The group headings below only restate the name
// prefixes of the entries that follow them.
typedef enum
{
    // GPUPE_VGT_SQ_*
    GPUPE_VGT_SQ_EVENT_WINDOW_ACTIVE       = 0,
    GPUPE_VGT_SQ_SEND                      = 1,
    GPUPE_VGT_SQ_STALLED                   = 2,
    GPUPE_VGT_SQ_STARVED_BUSY              = 3,
    GPUPE_VGT_SQ_STARVED_IDLE              = 4,
    GPUPE_VGT_SQ_STATIC                    = 5,

    // GPUPE_VGT_PA_*
    GPUPE_VGT_PA_EVENT_WINDOW_ACTIVE       = 6,
    GPUPE_VGT_PA_CLIP_V_SEND               = 7,
    GPUPE_VGT_PA_CLIP_V_STALLED            = 8,
    GPUPE_VGT_PA_CLIP_V_STARVED_BUSY       = 9,
    GPUPE_VGT_PA_CLIP_V_STARVED_IDLE       = 10,
    GPUPE_VGT_PA_CLIP_V_STATIC             = 11,
    GPUPE_VGT_PA_CLIP_P_SEND               = 12,
    GPUPE_VGT_PA_CLIP_P_STALLED            = 13,
    GPUPE_VGT_PA_CLIP_P_STARVED_BUSY       = 14,
    GPUPE_VGT_PA_CLIP_P_STARVED_IDLE       = 15,
    GPUPE_VGT_PA_CLIP_P_STATIC             = 16,
    GPUPE_VGT_PA_CLIP_S_SEND               = 17,
    GPUPE_VGT_PA_CLIP_S_STALLED            = 18,
    GPUPE_VGT_PA_CLIP_S_STARVED_BUSY       = 19,
    GPUPE_VGT_PA_CLIP_S_STARVED_IDLE       = 20,
    GPUPE_VGT_PA_CLIP_S_STATIC             = 21,

    // GPUPE_RBIU_*
    GPUPE_RBIU_FIFOS_EVENT_WINDOW_ACTIVE   = 22,
    GPUPE_RBIU_IMMED_DATA_FIFO_STARVED     = 23,
    GPUPE_RBIU_IMMED_DATA_FIFO_STALLED     = 24,
    GPUPE_RBIU_DMA_REQUEST_FIFO_STARVED    = 25,
    GPUPE_RBIU_DMA_REQUEST_FIFO_STALLED    = 26,
    GPUPE_RBIU_DRAW_INITIATOR_FIFO_STARVED = 27,
    GPUPE_RBIU_DRAW_INITIATOR_FIFO_STALLED = 28,

    // Spare slots 29-39
    GPUPE_SPARE29                          = 29,
    GPUPE_SPARE30                          = 30,
    GPUPE_SPARE31                          = 31,
    GPUPE_SPARE32                          = 32,
    GPUPE_SPARE33                          = 33,
    GPUPE_SPARE34                          = 34,
    GPUPE_SPARE35                          = 35,
    GPUPE_SPARE36                          = 36,
    GPUPE_SPARE37                          = 37,
    GPUPE_SPARE38                          = 38,
    GPUPE_SPARE39                          = 39,

    // GPUPE_TE_*
    GPUPE_TE_SU_IN_VALID                   = 40,
    GPUPE_TE_SU_IN_READ                    = 41,
    GPUPE_TE_SU_IN_PRIM                    = 42,
    GPUPE_TE_SU_IN_EOP                     = 43,
    GPUPE_TE_SU_IN_NULL_PRIM               = 44,
    GPUPE_TE_WK_IN_VALID                   = 45,
    GPUPE_TE_WK_IN_READ                    = 46,
    GPUPE_TE_OUT_PRIM_VALID                = 47,
    GPUPE_TE_OUT_PRIM_READ                 = 48,
} GPUPERFEVENT_VGT;

// Event identifiers collected under GPUPERFEVENT_VC.
//
// Values are explicit and contiguous from 0 through 69.  The per-entry
// comments attribute the events to the RG, CC, MI, RP, DC and VC units;
// each unit's range ends with slots marked as reserved.
typedef enum
{
    // RG
    GPUPE_RG_VERTICES           = 0,  // Number of vertices processed by RG
    GPUPE_RG_CLAMPED            = 1,  // Number of vertices clamped by RG
    GPUPE_RG_L2_REQUEST         = 2,  // Count of L2 requests created by RG
    GPUPE_RG_L1_REQUEST         = 3,  // Count of L1 requests created by RG
    GPUPE_RG_MEGAFETCH          = 4,  // Count of megafetches processed by RG
    GPUPE_RG_END_OF_GROUP       = 5,  // Count of end_of_group signals received from SQ
    GPUPE_RG_CONFLICT           = 6,  // Number of vertex pairs that conflict in the L2
    GPUPE_RG_DWORDS_REQUESTED   = 7,  // Number of dword requests created by RG
    GPUPE_RG_SPARE0             = 8,  // Reserved for RG
    GPUPE_RG_SPARE1             = 9,  // Reserved for RG

    // CC
    GPUPE_CC_STALLS             = 10, // Number of clocks CC stalled due to memory latency
    GPUPE_CC_HITS               = 11, // L2 requests that resulted in a cache hit
    GPUPE_CC_MISSES             = 12, // L2 requests that resulted in a cache miss
    GPUPE_CC_SECTOR_MISSES      = 13, // L2 requests that resulted in a sector miss
    GPUPE_CC_L2B_STALLS         = 14, // CC stalled due to L2B FIFO not RTR
    GPUPE_CC_MI_STALLS          = 15, // CC stalled due to MI not RTR
    GPUPE_CC_MULTICYCLE_STALLS  = 16, // CC stalled due to multicycle
    GPUPE_CC_EVEN_ALLOC_STALLS  = 17, // CC stalled due to even allocation stall
    GPUPE_CC_ODD_ALLOC_STALLS   = 18, // CC stalled due to odd allocation stall
    GPUPE_CC_EVEN_BUSY_STALLS   = 19, // CC stalled due to even counter busy
    GPUPE_CC_ODD_BUSY_STALLS    = 20, // CC stalled due to odd counter busy
    GPUPE_CC_IN_FIFO_EMPTY      = 21, // CC input FIFO is empty
    GPUPE_CC_IN_FIFO_FULL       = 22, // CC input FIFO is full
    GPUPE_CC_FREEZE             = 23, // CC frozen due to one of the stall conditions
    GPUPE_CC_SPARE0             = 24, // Reserved for CC
    GPUPE_CC_SPARE1             = 25, // Reserved for CC
    GPUPE_CC_SPARE2             = 26, // Reserved for CC
    GPUPE_CC_SPARE3             = 27, // Reserved for CC
    GPUPE_CC_SPARE4             = 28, // Reserved for CC
    GPUPE_CC_SPARE5             = 29, // Reserved for CC

    // MI
    GPUPE_MI_REQUESTS_TO_MH     = 30, // Number of memory requests issued to memory hub
    GPUPE_MI_AGP_REQUESTS       = 31, // Number of AGP memory requests issued
    GPUPE_MI_LATENCY_BITS_4_0   = 32, // Bits 4:0 of the latency count
    GPUPE_MI_LATENCY_BITS_9_5   = 33, // Bits 9:5 of the latency count
    GPUPE_MI_LATENCY_BITS_14_10 = 34, // Bits 14:10 of the latency count
    GPUPE_MI_LATENCY_BITS_17_15 = 35, // Bits 17:15 of the latency count
    GPUPE_MI_INPUT_FIFOS_FULL   = 36, // Number of cycles any of the four Input FIFOs to the MI requestor are full
    GPUPE_MI_INPUT_FIFOS_0_FULL = 37, // Number of cycles any of the two Input FIFOs on request path 0 to the MI requestor are full
    GPUPE_MI_INPUT_FIFOS_1_FULL = 38, // Number of cycles any of the two Input FIFOs on request path 1 to the MI requestor are full
    GPUPE_MI_SPARE1             = 39, // Reserved for MI

    // RP
    GPUPE_RP_SP_DATA_VALID      = 40, // Number of clocks of valid data returned to SP
    GPUPE_RP_STALLED            = 41, // Number of clocks RP stalled due to TC use of phase
    GPUPE_RP_SPARE0             = 42, // Reserved for RP
    GPUPE_RP_SPARE1             = 43, // Reserved for RP
    GPUPE_RP_SPARE2             = 44, // Reserved for RP
    GPUPE_RP_SPARE3             = 45, // Reserved for RP
    GPUPE_RP_SPARE4             = 46, // Reserved for RP
    GPUPE_RP_SPARE5             = 47, // Reserved for RP
    GPUPE_RP_SPARE6             = 48, // Reserved for RP
    GPUPE_RP_SPARE7             = 49, // Reserved for RP

    // DC
    GPUPE_DC_NUM_VALIDS         = 50, // Number of valid vectors returned to the SP
    GPUPE_DC_SPARE0             = 51, // Reserved for DC
    GPUPE_DC_SPARE1             = 52, // Reserved for DC
    GPUPE_DC_SPARE2             = 53, // Reserved for DC
    GPUPE_DC_SPARE3             = 54, // Reserved for DC
    GPUPE_DC_SPARE4             = 55, // Reserved for DC
    GPUPE_DC_SPARE5             = 56, // Reserved for DC
    GPUPE_DC_SPARE6             = 57, // Reserved for DC
    GPUPE_DC_SPARE7             = 58, // Reserved for DC
    GPUPE_DC_SPARE8             = 59, // Reserved for DC

    // VC
    GPUPE_SQ_VC_SEND            = 60, // Number of clocks the SQ is passing data to the VC
    GPUPE_VC_STARVED_IDLE       = 61, // Number of clock cycles the VC is idle and waiting for more data from the SQ
    GPUPE_VC_BUSY               = 62, // Number of clock cycles the VC is busy processing data
    GPUPE_VC_IDLE               = 63, // Number of clock cycles the VC is idle
    GPUPE_VC_SPARE0             = 64, // Reserved for VC
    GPUPE_VC_SPARE1             = 65, // Reserved for VC
    GPUPE_VC_SPARE2             = 66, // Reserved for VC
    GPUPE_VC_SPARE3             = 67, // Reserved for VC
    GPUPE_VC_SPARE4             = 68, // Reserved for VC
    GPUPE_VC_SPARE5             = 69, // Reserved for VC
} GPUPERFEVENT_VC;

// Event identifiers collected under GPUPERFEVENT_PA_SU.
//
// Values are explicit and contiguous from 0 through 101.  Entries 0-68 are
// described as item counts (requests, vectors, vertices, primitives, events),
// while entries 69-101 are described as clock counts for idle/busy/stalled/
// starved states.  Entries 38 and 46 are marked UNUSED.
typedef enum
{
    // PA<->SX transfers
    GPUPE_PERF_PAPC_PASX_REQ                   = 0,   // Number of PA->SX requests
    GPUPE_PERF_PAPC_PASX_DISABLE_PIPE          = 1,   // Number of transfers lost due to disabled pipe
    GPUPE_PERF_PAPC_PASX_FIRST_VECTOR          = 2,   // Number of First Vectors from SX to PA
    GPUPE_PERF_PAPC_PASX_SECOND_VECTOR         = 3,   // Number of Second Vectors from SX to PA
    GPUPE_PERF_PAPC_PASX_FIRST_DEAD            = 4,   // Number of Unused First Vectors (due to granularity of 4)
    GPUPE_PERF_PAPC_PASX_SECOND_DEAD           = 5,   // Number of Unused Second Vectors (due to granularity of 4)
    GPUPE_PERF_PAPC_PASX_VTX_KILL_DISCARD      = 6,   // Number of vertices which have VTX KILL Enabled and Set
    GPUPE_PERF_PAPC_PASX_VTX_NAN_DISCARD       = 7,   // Number of vertices which have NaN and corresponding NaN discard

    // PA input
    GPUPE_PERF_PAPC_PA_INPUT_PRIM              = 8,   // Number of Primitives input to PA
    GPUPE_PERF_PAPC_PA_INPUT_NULL_PRIM         = 9,   // Number of Null Primitives input to PA
    GPUPE_PERF_PAPC_PA_INPUT_EVENT_FLAG        = 10,  // Number of Events input to PA
    GPUPE_PERF_PAPC_PA_INPUT_FIRST_PRIM_SLOT   = 11,  // Number of First-Prim-Of-Slots input to PA
    GPUPE_PERF_PAPC_PA_INPUT_END_OF_PACKET     = 12,  // Number of End-Of-Packets input to PA

    // Clipper (CLPR) culling and clipping
    GPUPE_PERF_PAPC_CLPR_CULL_PRIM             = 13,  // Number of Prims Culled by Clipper for VV, UCP, VTX_KILL, VTX_NAN
    GPUPE_PERF_PAPC_CLPR_VVUCP_CULL_PRIM       = 14,  // Number of Prims Culled by Clipper for VV and UCP
    GPUPE_PERF_PAPC_CLPR_VV_CULL_PRIM          = 15,  // Number of Prims Culled by Clipper for VV
    GPUPE_PERF_PAPC_CLPR_UCP_CULL_PRIM         = 16,  // Number of Prims Culled by Clipper for UCP
    GPUPE_PERF_PAPC_CLPR_VTX_KILL_CULL_PRIM    = 17,  // Number of Prims Culled by Clipper for VTX_KILL
    GPUPE_PERF_PAPC_CLPR_VTX_NAN_CULL_PRIM     = 18,  // Number of Prims Culled by Clipper for VTX_NAN
    GPUPE_PERF_PAPC_CLPR_CULL_TO_NULL_PRIM     = 19,  // Number of Clipper Culled Prims Retained for Pipe Info
    GPUPE_PERF_PAPC_CLPR_VVUCP_CLIP_PRIM       = 20,  // Number of Prims Clipped by Clipper for VV and/or UCP
    GPUPE_PERF_PAPC_CLPR_VV_CLIP_PRIM          = 21,  // Number of Prims Clipped by Clipper for VV
    GPUPE_PERF_PAPC_CLPR_UCP_CLIP_PRIM         = 22,  // Number of Prims Clipped by Clipper for UCP
    GPUPE_PERF_PAPC_CLPR_POINT_CLIP_CANDIDATE  = 23,  // Number of Points which require detailed clip checked
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_CNT_1      = 24,  // Number of Prims with 1 Clip Plane Intersection (includes VV and UCP)
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_CNT_2      = 25,  // Number of Prims with 2 Clip Plane Intersections (includes VV and UCP)
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_CNT_3      = 26,  // Number of Prims with 3 Clip Plane Intersections (includes VV and UCP)
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_CNT_4      = 27,  // Number of Prims with 4 Clip Plane Intersections (includes VV and UCP)
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_CNT_5_8    = 28,  // Number of Prims with 5-8 Clip Plane Intersections (includes VV and UCP)
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_CNT_9_12   = 29,  // Number of Prims with 9-12 Clip Plane Intersections (includes VV and UCP)
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_NEAR       = 30,  // Number of Prims which intersect the NEAR   VV Plane
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_FAR        = 31,  // Number of Prims which intersect the FAR    VV Plane
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_LEFT       = 32,  // Number of Prims which intersect the LEFT   VV Plane
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_RIGHT      = 33,  // Number of Prims which intersect the RIGHT  VV Plane
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_TOP        = 34,  // Number of Prims which intersect the TOP    VV Plane
    GPUPE_PERF_PAPC_CLPR_CLIP_PLANE_BOTTOM     = 35,  // Number of Prims which intersect the BOTTOM VV Plane

    // Clip State Machine (CLSM) stage
    GPUPE_PERF_PAPC_CLSM_NULL_PRIM             = 36,  // Number of null primitives at Clip State Machine pipe stage
    GPUPE_PERF_PAPC_CLSM_TOTALLY_VISIBLE_PRIM  = 37,  // Number of totally visible (no-clipping) prims
    GPUPE_PERF_PAPC_CLSM_CLIP_PRIM             = 38,  // UNUSED
    GPUPE_PERF_PAPC_CLSM_CULL_TO_NULL_PRIM     = 39,  // Number of primitives which are culled during clip process
    GPUPE_PERF_PAPC_CLSM_OUT_PRIM_CNT_1        = 40,  // Number of primitives which were clipped and result in 1 primitive
    GPUPE_PERF_PAPC_CLSM_OUT_PRIM_CNT_2        = 41,  // Number of primitives which were clipped and result in 2 primitives
    GPUPE_PERF_PAPC_CLSM_OUT_PRIM_CNT_3        = 42,  // Number of primitives which were clipped and result in 3 primitives
    GPUPE_PERF_PAPC_CLSM_OUT_PRIM_CNT_4        = 43,  // Number of primitives which were clipped and result in 4 primitives
    GPUPE_PERF_PAPC_CLSM_OUT_PRIM_CNT_5_8      = 44,  // Number of primitives which were clipped and result in 5-8 primitives
    GPUPE_PERF_PAPC_CLSM_OUT_PRIM_CNT_9_13     = 45,  // Number of primitives which were clipped and result in 9-13 primitives
    GPUPE_PERF_PAPC_CLSM_NON_TRIVIAL_CULL      = 46,  // UNUSED

    // Setup (SU) block input, culling and output
    GPUPE_PERF_PAPC_SU_INPUT_PRIM              = 47,  // Number of primitives input to the Setup block
    GPUPE_PERF_PAPC_SU_INPUT_CLIP_PRIM         = 48,  // Number of clipped primitives input to the Setup block
    GPUPE_PERF_PAPC_SU_INPUT_NULL_PRIM         = 49,  // Number of null primitives input to the Setup block
    GPUPE_PERF_PAPC_SU_ZERO_AREA_CULL_PRIM     = 50,  // Number of primitives culled due to zero area
    GPUPE_PERF_PAPC_SU_BACK_FACE_CULL_PRIM     = 51,  // Number of back-face primitives culled due to facedness
    GPUPE_PERF_PAPC_SU_FRONT_FACE_CULL_PRIM    = 52,  // Number of front-face primitives culled due to facedness
    GPUPE_PERF_PAPC_SU_POLYMODE_FACE_CULL      = 53,  // Number of polymode cull-determination primitives culled
    GPUPE_PERF_PAPC_SU_POLYMODE_BACK_CULL      = 54,  // Number of polymode primitives discarded due to Back-Face Cull
    GPUPE_PERF_PAPC_SU_POLYMODE_FRONT_CULL     = 55,  // Number of polymode primitives discarded due to Front-Face Cull
    GPUPE_PERF_PAPC_SU_POLYMODE_INVALID_FILL   = 56,  // Number of polymode lines and/or points which are culled because they are an internal edge or point
    GPUPE_PERF_PAPC_SU_OUTPUT_PRIM             = 57,  // Number of primitives output from the Setup block
    GPUPE_PERF_PAPC_SU_OUTPUT_CLIP_PRIM        = 58,  // Number of clipped primitives output from the Setup block
    GPUPE_PERF_PAPC_SU_OUTPUT_NULL_PRIM        = 59,  // Number of null primitives output from the Setup block
    GPUPE_PERF_PAPC_SU_OUTPUT_EVENT_FLAG       = 60,  // Number of events output from the Setup block
    GPUPE_PERF_PAPC_SU_OUTPUT_FIRST_PRIM_SLOT  = 61,  // Number of First-Prim-Of-Slots output from the Setup block
    GPUPE_PERF_PAPC_SU_OUTPUT_END_OF_PACKET    = 62,  // Number of End-Of-Packets output from the Setup block
    GPUPE_PERF_PAPC_SU_OUTPUT_POLYMODE_FACE    = 63,  // Number of polymode facing primitives output from the Setup block
    GPUPE_PERF_PAPC_SU_OUTPUT_POLYMODE_BACK    = 64,  // Number of polymode back-face primitives output from the Setup block
    GPUPE_PERF_PAPC_SU_OUTPUT_POLYMODE_FRONT   = 65,  // Number of polymode front-face primitives output from the Setup block
    GPUPE_PERF_PAPC_SU_OUT_CLIP_POLYMODE_FACE  = 66,  // Number of clipped polymode facing primitives output from the Setup block
    GPUPE_PERF_PAPC_SU_OUT_CLIP_POLYMODE_BACK  = 67,  // Number of clipped polymode back-face primitives output from the Setup block
    GPUPE_PERF_PAPC_SU_OUT_CLIP_POLYMODE_FRONT = 68,  // Number of clipped polymode front-face primitives output from the Setup block

    // Clock counts: PASX requestor / receiver
    GPUPE_PERF_PAPC_PASX_REQ_IDLE              = 69,  // Number of clocks PASX Requestor is Idle
    GPUPE_PERF_PAPC_PASX_REQ_BUSY              = 70,  // Number of clocks PASX Requestor is Busy
    GPUPE_PERF_PAPC_PASX_REQ_STALLED           = 71,  // Number of clocks PASX Requestor is Stalled
    GPUPE_PERF_PAPC_PASX_REC_IDLE              = 72,  // Number of clocks PASX Receiver is Idle
    GPUPE_PERF_PAPC_PASX_REC_BUSY              = 73,  // Number of clocks PASX Receiver is Busy
    GPUPE_PERF_PAPC_PASX_REC_STARVED_SX        = 74,  // Number of clocks PASX Receiver is Stalled by SX (NOTE(review): name says STARVED - confirm)
    GPUPE_PERF_PAPC_PASX_REC_STALLED           = 75,  // Number of clocks PASX Receiver is Stalled by Position Memory or Clip Code Generator
    GPUPE_PERF_PAPC_PASX_REC_STALLED_POS_MEM   = 76,  // Number of clocks PASX Receiver is Stalled by Position Memory
    GPUPE_PERF_PAPC_PASX_REC_STALLED_CCGSM_IN  = 77,  // Number of clocks PASX Receiver is Stalled by Clip Code Generator

    // Clock counts: Clip Code Generator
    GPUPE_PERF_PAPC_CCGSM_IDLE                 = 78,  // Number of clocks Clip Code Gen is Idle
    GPUPE_PERF_PAPC_CCGSM_BUSY                 = 79,  // Number of clocks Clip Code Gen is Busy
    GPUPE_PERF_PAPC_CCGSM_STALLED              = 80,  // Number of clocks Clip Code Gen is Stalled

    // Clock counts: Clip Primitive Machine
    GPUPE_PERF_PAPC_CLPRIM_IDLE                = 81,  // Number of clocks Clip Primitive Machine is Idle
    GPUPE_PERF_PAPC_CLPRIM_BUSY                = 82,  // Number of clocks Clip Primitive Machine is Busy
    GPUPE_PERF_PAPC_CLPRIM_STALLED             = 83,  // Number of clocks Clip Primitive Machine is stalled by Clip State Machines
    GPUPE_PERF_PAPC_CLPRIM_STARVED_CCGSM       = 84,  // Number of clocks Clip Primitive Machine is starved by Clip Code Generator

    // Clock counts: Clip State Machines
    GPUPE_PERF_PAPC_CLIPSM_IDLE                = 85,  // Number of clocks Clip State Machines are Idle
    GPUPE_PERF_PAPC_CLIPSM_BUSY                = 86,  // Number of clocks Clip State Machines are Busy
    GPUPE_PERF_PAPC_CLIPSM_WAIT_CLIP_VERT_ENGH = 87,  // Number of clocks Clip State Machines are waiting for Clip Vert storage resources
    GPUPE_PERF_PAPC_CLIPSM_WAIT_HIGH_PRI_SEQ   = 88,  // Number of clocks Clip State Machines are waiting for High Priority Sequencer
    GPUPE_PERF_PAPC_CLIPSM_WAIT_CLIPGA         = 89,  // Number of clocks Clip State Machines are waiting for ClipGA
    GPUPE_PERF_PAPC_CLIPSM_WAIT_AVAIL_VTE_CLIP = 90,  // Number of clocks Clip State Machines are waiting for VTE cycles
    GPUPE_PERF_PAPC_CLIPSM_WAIT_CLIP_OUTSM     = 91,  // Number of clocks Clip State Machines are waiting for Clip Output State Machine

    // Clock counts: Clip Ga
    GPUPE_PERF_PAPC_CLIPGA_IDLE                = 92,  // Number of clocks Clip Ga is Idle
    GPUPE_PERF_PAPC_CLIPGA_BUSY                = 93,  // Number of clocks Clip Ga is Busy
    GPUPE_PERF_PAPC_CLIPGA_STARVED_VTE_CLIP    = 94,  // Number of clocks Clip Ga is Starved by VTE or Clipper
    GPUPE_PERF_PAPC_CLIPGA_STALLED             = 95,  // Number of clocks Clip Ga is stalled

    // Clock counts: Clip and Setup as a whole
    GPUPE_PERF_PAPC_CLIP_IDLE                  = 96,  // Number of clocks Clip is Idle
    GPUPE_PERF_PAPC_CLIP_BUSY                  = 97,  // Number of clocks Clip is Busy
    GPUPE_PERF_PAPC_SU_IDLE                    = 98,  // Number of clocks Setup is Idle
    GPUPE_PERF_PAPC_SU_BUSY                    = 99,  // Number of clocks Setup is Busy
    GPUPE_PERF_PAPC_SU_STARVED_CLIP            = 100, // Number of clocks Setup is starved by Clipper
    GPUPE_PERF_PAPC_SU_STALLED_SC              = 101, // Number of clocks Setup is stalled by SC
} GPUPERFEVENT_PA_SU;

// Event identifiers collected under GPUPERFEVENT_PA_SC.
//
// Values are explicit and contiguous from 0 through 107.  Several runs are
// numbered H0..Hn series (tiles per prim, quads per tile, quads per packer
// row); the comment on each entry gives the bucket it counts.
typedef enum
{
    // Event-window valid, per stage
    GPUPE_SC_SR_WINDOW_VALID        = 0,   // Number of clocks event-window is valid at stage_reg
    GPUPE_SC_CW_WINDOW_VALID        = 1,   // Number of clocks event-window is valid at coarse_walker
    GPUPE_SC_QM_WINDOW_VALID        = 2,   // Number of clocks event-window is valid at quadmask
    GPUPE_SC_QPP_WINDOW_VALID       = 3,   // Number of clocks event-window is valid at quad pair proc
    GPUPE_SC_ITER_WINDOW_VALID      = 4,   // Number of clocks event-window is valid at iter

    // Starve / stall conditions
    GPUPE_SC_STARVED_BY_PA          = 5,   // sc_rtr and not pa_rts and sc_busy
    GPUPE_SC_STARVED_BY_RCC         = 6,   // sc_tile_fifo full, rcc z return fifo empty
    GPUPE_SC_STALLED_BY_PRIM_FF     = 7,   // sc primitive fifo full is causing a stall
    GPUPE_SC_STALLED_BY_RCC         = 8,   // sc_rts and not rcc_rtr
    GPUPE_SC_STALLED_BY_BC          = 9,   // sc_rts and not bc_rtr
    GPUPE_SC_STALLED_BY_SX          = 10,  // sc_rts and not (sx0_rtr and sx1_rtr)
    GPUPE_SC_STALLED_BY_SX0         = 11,  // sc_rts and not sx0_rtr
    GPUPE_SC_STALLED_BY_SX1         = 12,  // sc_rts and not sx1_rtr
    GPUPE_SC_STALLED_BY_SQ          = 13,  // sc_sq count >= max
    GPUPE_SC_STALLED_BY_SP          = 14,  // sc_sp count >= max
    GPUPE_SC_WAIT_FOR_R1            = 15,  // qpp has a single quad and is waiting for another tile for grouping

    // Primitive discards and counts
    GPUPE_SC_SCISSOR_DISCARD        = 16,  // prim completely discarded by scissor
    GPUPE_SC_BB_DISCARD             = 17,  // prim discarded by bounding-box check, no pixels hit
    GPUPE_SC_SUPERT_PRIM_DISCARD    = 18,  // prim completely discarded by super_tile optimization
    GPUPE_SC_RT_PRIM                = 19,  // real-time prim count

    // Tiles per primitive
    GPUPE_SC_TILE_VALID             = 20,  // tile count
    GPUPE_SC_TILE_PER_PRIM_H0       = 21,  // prims with < 2   tiles
    GPUPE_SC_TILE_PER_PRIM_H1       = 22,  // prims with < 4   tiles
    GPUPE_SC_TILE_PER_PRIM_H2       = 23,  // prims with < 8   tiles
    GPUPE_SC_TILE_PER_PRIM_H3       = 24,  // prims with < 16  tiles
    GPUPE_SC_TILE_PER_PRIM_H4       = 25,  // prims with < 32  tiles
    GPUPE_SC_TILE_PER_PRIM_H5       = 26,  // prims with < 64  tiles
    GPUPE_SC_TILE_PER_PRIM_H6       = 27,  // prims with < 128 tiles
    GPUPE_SC_TILE_PER_PRIM_H7       = 28,  // prims with < 256 tiles
    GPUPE_SC_TILE_PER_PRIM_H8       = 29,  // prims with < 512 tiles
    GPUPE_SC_TILE_PER_PRIM_H9       = 30,  // prims with < 1K  tiles
    GPUPE_SC_TILE_PER_PRIM_H10      = 31,  // prims with < 2K  tiles
    GPUPE_SC_TILE_PER_PRIM_H11      = 32,  // prims with < 4K  tiles
    GPUPE_SC_TILE_PER_PRIM_H12      = 33,  // prims with < 8K  tiles
    GPUPE_SC_TILE_PER_PRIM_H13      = 34,  // prims with < 16K tiles
    GPUPE_SC_TILE_PER_PRIM_H14      = 35,  // prims with < 32K tiles
    GPUPE_SC_TILE_PER_PRIM_H15      = 36,  // prims with < 64K tiles
    GPUPE_SC_TILE_PER_PRIM_H16      = 37,  // prims with < 1M  tiles
    GPUPE_SC_SUPERT_TILE_DISCARD    = 38,  // tiles discarded by super_tile optimization

    // Quadmask: quads hit per tile
    GPUPE_SC_QM_NUM_QUADS           = 39,  // total quads hit by coarsewalk
    GPUPE_SC_QM_MASK_H0             = 40,  // tiles walked with 0  quads hit
    GPUPE_SC_QM_MASK_H1             = 41,  // tiles walked with 1  quads hit
    GPUPE_SC_QM_MASK_H2             = 42,  // tiles walked with 2  quads hit
    GPUPE_SC_QM_MASK_H3             = 43,  // tiles walked with 3  quads hit
    GPUPE_SC_QM_MASK_H4             = 44,  // tiles walked with 4  quads hit
    GPUPE_SC_QM_MASK_H5             = 45,  // tiles walked with 5  quads hit
    GPUPE_SC_QM_MASK_H6             = 46,  // tiles walked with 6  quads hit
    GPUPE_SC_QM_MASK_H7             = 47,  // tiles walked with 7  quads hit
    GPUPE_SC_QM_MASK_H8             = 48,  // tiles walked with 8  quads hit
    GPUPE_SC_QM_MASK_H9             = 49,  // tiles walked with 9  quads hit
    GPUPE_SC_QM_MASK_H10            = 50,  // tiles walked with 10 quads hit
    GPUPE_SC_QM_MASK_H11            = 51,  // tiles walked with 11 quads hit
    GPUPE_SC_QM_MASK_H12            = 52,  // tiles walked with 12 quads hit
    GPUPE_SC_QM_MASK_H13            = 53,  // tiles walked with 13 quads hit
    GPUPE_SC_QM_MASK_H14            = 54,  // tiles walked with 14 quads hit
    GPUPE_SC_QM_MASK_H15            = 55,  // tiles walked with 15 quads hit
    GPUPE_SC_QM_MASK_H16            = 56,  // tiles walked with 16 quads hit

    // Quadmask: quads covered per tile
    GPUPE_SC_QM_COVERED_H0          = 57,  // tiles walked with 0  quads covered
    GPUPE_SC_QM_COVERED_H1          = 58,  // tiles walked with 1  quads covered
    GPUPE_SC_QM_COVERED_H2          = 59,  // tiles walked with 2  quads covered
    GPUPE_SC_QM_COVERED_H3          = 60,  // tiles walked with 3  quads covered
    GPUPE_SC_QM_COVERED_H4          = 61,  // tiles walked with 4  quads covered
    GPUPE_SC_QM_COVERED_H5          = 62,  // tiles walked with 5  quads covered
    GPUPE_SC_QM_COVERED_H6          = 63,  // tiles walked with 6  quads covered
    GPUPE_SC_QM_COVERED_H7          = 64,  // tiles walked with 7  quads covered
    GPUPE_SC_QM_COVERED_H8          = 65,  // tiles walked with 8  quads covered
    GPUPE_SC_QM_COVERED_H9          = 66,  // tiles walked with 9  quads covered
    GPUPE_SC_QM_COVERED_H10         = 67,  // tiles walked with 10 quads covered
    GPUPE_SC_QM_COVERED_H11         = 68,  // tiles walked with 11 quads covered
    GPUPE_SC_QM_COVERED_H12         = 69,  // tiles walked with 12 quads covered
    GPUPE_SC_QM_COVERED_H13         = 70,  // tiles walked with 13 quads covered
    GPUPE_SC_QM_COVERED_H14         = 71,  // tiles walked with 14 quads covered
    GPUPE_SC_QM_COVERED_H15         = 72,  // tiles walked with 15 quads covered
    GPUPE_SC_QM_COVERED_H16         = 73,  // tiles walked with 16 quads covered

    // Hierarchical Z: quads surviving z per tile
    GPUPE_SC_HIER_NUM_QUADS         = 74,  // total quads surviving z
    GPUPE_SC_HIER_MASK_H0           = 75,  // tiles with 0  quads surviving z
    GPUPE_SC_HIER_MASK_H1           = 76,  // tiles with 1  quads surviving z
    GPUPE_SC_HIER_MASK_H2           = 77,  // tiles with 2  quads surviving z
    GPUPE_SC_HIER_MASK_H3           = 78,  // tiles with 3  quads surviving z
    GPUPE_SC_HIER_MASK_H4           = 79,  // tiles with 4  quads surviving z
    GPUPE_SC_HIER_MASK_H5           = 80,  // tiles with 5  quads surviving z
    GPUPE_SC_HIER_MASK_H6           = 81,  // tiles with 6  quads surviving z
    GPUPE_SC_HIER_MASK_H7           = 82,  // tiles with 7  quads surviving z
    GPUPE_SC_HIER_MASK_H8           = 83,  // tiles with 8  quads surviving z
    GPUPE_SC_HIER_MASK_H9           = 84,  // tiles with 9  quads surviving z
    GPUPE_SC_HIER_MASK_H10          = 85,  // tiles with 10 quads surviving z
    GPUPE_SC_HIER_MASK_H11          = 86,  // tiles with 11 quads surviving z
    GPUPE_SC_HIER_MASK_H12          = 87,  // tiles with 12 quads surviving z
    GPUPE_SC_HIER_MASK_H13          = 88,  // tiles with 13 quads surviving z
    GPUPE_SC_HIER_MASK_H14          = 89,  // tiles with 14 quads surviving z
    GPUPE_SC_HIER_MASK_H15          = 90,  // tiles with 15 quads surviving z
    GPUPE_SC_HIER_MASK_H16          = 91,  // tiles with 16 quads surviving z

    // Detail sampler and packer
    GPUPE_SC_DET_NUM_QUADS          = 92,  // total quads surviving detail sampler
    GPUPE_SC_PKR_QD_PER_ROW_H1      = 93,  // packer row outputs with 1 valid quad
    GPUPE_SC_PKR_QD_PER_ROW_H2      = 94,  // packer row outputs with 2 valid quads
    GPUPE_SC_PKR_QD_PER_ROW_H3      = 95,  // packer row outputs with 3 valid quads
    GPUPE_SC_PKR_QD_PER_ROW_H4      = 96,  // packer row outputs with 4 valid quads
    GPUPE_SC_PKR_END_OF_VECTOR      = 97,  // number of pixel vectors
    GPUPE_SC_PKR_ONE_CLK            = 98,  // number of one clock commands
    GPUPE_SC_QD_WITH_1_PIX          = 99,  // quads with 1 pixel surviving detail
    GPUPE_SC_QD_WITH_2_PIX          = 100, // quads with 2 pixels surviving detail
    GPUPE_SC_QD_WITH_3_PIX          = 101, // quads with 3 pixels surviving detail
    GPUPE_SC_QD_WITH_4_PIX          = 102, // quads with 4 pixels surviving detail

    // Event-window valid while sc is busy, per stage
    GPUPE_SC_SR_WINDOW_VALID_BUSY   = 103, // Event-window is valid at stage_reg with sc busy
    GPUPE_SC_CW_WINDOW_VALID_BUSY   = 104, // Event-window is valid at coarse_walker with sc busy
    GPUPE_SC_QM_WINDOW_VALID_BUSY   = 105, // Event-window is valid at quadmask with sc busy
    GPUPE_SC_QPP_WINDOW_VALID_BUSY  = 106, // Event-window is valid at quad pair proc with sc busy
    GPUPE_SC_ITER_WINDOW_VALID_BUSY = 107, // Event-window is valid at iter with sc busy
} GPUPERFEVENT_PA_SC;

// Event identifiers collected under GPUPERFEVENT_HZ.
//
// Values are explicit and contiguous from 0 through 31.  Entries 5, 8 and 9
// are marked as reserved for HZ.
typedef enum
{
    // hz_mem write side
    GPUPE_WRC_1VAL_QUADS    = 0,  // Number of writes to hz_mem with 1 valid quad
    GPUPE_WRC_2VAL_QUADS    = 1,  // Number of writes to hz_mem with 2 valid quads
    GPUPE_WRC_3VAL_QUADS    = 2,  // Number of writes to hz_mem with 3 valid quads
    GPUPE_WRC_4VAL_QUADS    = 3,  // Number of writes to hz_mem with 4 valid quads
    GPUPE_HZ_WR_BUSY        = 4,  // Modules on the write-side of HZ memory are busy
    GPUPE_HZ_SPARE0         = 5,  // Reserved for HZ

    // hz_mem read side
    GPUPE_RDC_TILE_HIT      = 6,  // Number of tile hits in the Read Cache
    GPUPE_RDC_STAGE3_STALL  = 7,  // Number of conflicts/stalls in Read Stage 3
    GPUPE_HZ_SPARE1         = 8,  // Reserved for HZ
    GPUPE_HZ_SPARE2         = 9,  // Reserved for HZ
    GPUPE_QUADS_KEPT        = 10, // Number of quads kept
    GPUPE_QUADS_ZCULL       = 11, // Number of quads culled due to Z only
    GPUPE_QUADS_SCULL       = 12, // Number of quads culled due to Stencil only
    GPUPE_QUADS_SZCULL      = 13, // Number of quads culled due to both Z and Stencil
    GPUPE_HZ_RE_BUSY        = 14, // Modules on the read-side of HZ memory are busy

    // BC -> HZ input
    GPUPE_BC_SCLK_COUNT     = 15, // SCLK count for BC
    GPUPE_BC_HZ_VALID0      = 16, // Valid input data from BC bus 0
    GPUPE_BC_HZ_VALID1      = 17, // Valid input data from BC bus 1
    GPUPE_BC_HZ_VALID2      = 18, // Valid input data from BC bus 2
    GPUPE_BC_HZ_VALID3      = 19, // Valid input data from BC bus 3

    // Send / hold handshakes
    GPUPE_SC_SCLK_COUNT     = 20, // SCLK count for SC
    GPUPE_SC_HZ_COARSE_SND  = 21, // SC sending coarse Z data to HZ
    GPUPE_HZ_SC_COARSE_HLD  = 22, // HZ holding SC from sending coarse Z data
    GPUPE_HZ_SC_HIER_SND    = 23, // HZ sending mask data to BC (NOTE(review): name and the HLD entry below suggest SC, not BC - confirm)
    GPUPE_SC_HZ_HIER_HLD    = 24, // SC holding HZ from sending mask data
    GPUPE_HZ_BC_TILE_SND    = 25, // HZ sending tile data to BC
    GPUPE_BC_HZ_TILE_HLD    = 26, // BC holding HZ from sending tile data

    // HZ stalls and starvation
    GPUPE_SC_STALL_HZ       = 27, // HZ stalled by SC
    GPUPE_BC_STALL_HZ       = 28, // HZ stalled by BC
    GPUPE_EVENT_STALL       = 29, // HZ stalled by an Outstanding Event
    GPUPE_SC_STARVE_HZ_BUSY = 30, // HZ starved by the SC when the HZ is busy
    GPUPE_SC_STARVE_HZ_IDLE = 31, // HZ starved by the SC when the HZ is idle
} GPUPERFEVENT_HZ;

// Event identifiers collected under GPUPERFEVENT_TCR (Texture Cache Return).
// Values are explicit and contiguous from 0 through 8.
typedef enum
{
    // The IPMUX receives memory sent by MH; it can stall if the DXT
    // decompressor isn't ready.
    GPUPE_DGMMPD_IPMUX0_STALL    = 0, // Number of clocks ipmux0 is stalled. Each tick means 16 bytes of Texture BW lost.
    GPUPE_DGMMPD_IPMUX1_STALL    = 1, // Number of clocks ipmux1 is stalled. Each tick means 16 bytes of Texture BW lost.
    GPUPE_DGMMPD_IPMUX2_STALL    = 2, // Always 0
    GPUPE_DGMMPD_IPMUX3_STALL    = 3, // Always 0
    GPUPE_DGMMPD_IPMUX_ALL_STALL = 4, // Always 0

    // OPMUX L2 write counts
    GPUPE_OPMUX0_L2_WRITES       = 5, // Number of opmux0 L2 writes
    GPUPE_OPMUX1_L2_WRITES       = 6, // Number of opmux1 L2 writes
    GPUPE_OPMUX2_L2_WRITES       = 7, // Number of opmux2 L2 writes
    GPUPE_OPMUX3_L2_WRITES       = 8, // Number of opmux3 L2 writes
} GPUPERFEVENT_TCR; // Texture Cache Return

// Event identifiers for the TCM (Texture Cache Memory) performance counters.

typedef enum
{
    // Fill-level buckets for the TCO read latency fifos (256-deep),
    // one set of buckets per quad.
    GPUPE_QUAD0_RD_LAT_FIFO_EMPTY        = 0,  // Clocks the Quad0 read latency fifo is empty
    GPUPE_QUAD0_RD_LAT_FIFO_16TH_FULL    = 1,  // Clocks the Quad0 read latency fifo is at least 1/16th, but less than 1/8th, full
    GPUPE_QUAD0_RD_LAT_FIFO_8TH_FULL     = 2,  // Clocks the Quad0 read latency fifo is at least 1/8th, but less than 1/4th, full
    GPUPE_QUAD0_RD_LAT_FIFO_4TH_FULL     = 3,  // Clocks the Quad0 read latency fifo is at least 1/4th, but less than half, full
    GPUPE_QUAD0_RD_LAT_FIFO_HALF_FULL    = 4,  // Clocks the Quad0 read latency fifo is at least half full, but not full
    GPUPE_QUAD0_RD_LAT_FIFO_FULL         = 5,  // Clocks the Quad0 read latency fifo is full
    GPUPE_QUAD0_RD_LAT_FIFO_LT_16TH_FULL = 6,  // Clocks the Quad0 read latency fifo is not empty, but less than 1/16th full
    GPUPE_QUAD1_RD_LAT_FIFO_EMPTY        = 7,  // Clocks the Quad1 read latency fifo is empty
    GPUPE_QUAD1_RD_LAT_FIFO_16TH_FULL    = 8,  // Clocks the Quad1 read latency fifo is at least 1/16th, but less than 1/8th, full
    GPUPE_QUAD1_RD_LAT_FIFO_8TH_FULL     = 9,  // Clocks the Quad1 read latency fifo is at least 1/8th, but less than 1/4th, full
    GPUPE_QUAD1_RD_LAT_FIFO_4TH_FULL     = 10, // Clocks the Quad1 read latency fifo is at least 1/4th, but less than half, full
    GPUPE_QUAD1_RD_LAT_FIFO_HALF_FULL    = 11, // Clocks the Quad1 read latency fifo is at least half full, but not full
    GPUPE_QUAD1_RD_LAT_FIFO_FULL         = 12, // Clocks the Quad1 read latency fifo is full
    GPUPE_QUAD1_RD_LAT_FIFO_LT_16TH_FULL = 13, // Clocks the Quad1 read latency fifo is not empty, but less than 1/16th full
    GPUPE_QUAD2_RD_LAT_FIFO_EMPTY        = 14, // Clocks the Quad2 read latency fifo is empty
    GPUPE_QUAD2_RD_LAT_FIFO_16TH_FULL    = 15, // Clocks the Quad2 read latency fifo is at least 1/16th, but less than 1/8th, full
    GPUPE_QUAD2_RD_LAT_FIFO_8TH_FULL     = 16, // Clocks the Quad2 read latency fifo is at least 1/8th, but less than 1/4th, full
    GPUPE_QUAD2_RD_LAT_FIFO_4TH_FULL     = 17, // Clocks the Quad2 read latency fifo is at least 1/4th, but less than half, full
    GPUPE_QUAD2_RD_LAT_FIFO_HALF_FULL    = 18, // Clocks the Quad2 read latency fifo is at least half full, but not full
    GPUPE_QUAD2_RD_LAT_FIFO_FULL         = 19, // Clocks the Quad2 read latency fifo is full
    GPUPE_QUAD2_RD_LAT_FIFO_LT_16TH_FULL = 20, // Clocks the Quad2 read latency fifo is not empty, but less than 1/16th full
    GPUPE_QUAD3_RD_LAT_FIFO_EMPTY        = 21, // Clocks the Quad3 read latency fifo is empty
    GPUPE_QUAD3_RD_LAT_FIFO_16TH_FULL    = 22, // Clocks the Quad3 read latency fifo is at least 1/16th, but less than 1/8th, full
    GPUPE_QUAD3_RD_LAT_FIFO_8TH_FULL     = 23, // Clocks the Quad3 read latency fifo is at least 1/8th, but less than 1/4th, full
    GPUPE_QUAD3_RD_LAT_FIFO_4TH_FULL     = 24, // Clocks the Quad3 read latency fifo is at least 1/4th, but less than half, full
    GPUPE_QUAD3_RD_LAT_FIFO_HALF_FULL    = 25, // Clocks the Quad3 read latency fifo is at least half full, but not full
    GPUPE_QUAD3_RD_LAT_FIFO_FULL         = 26, // Clocks the Quad3 read latency fifo is full
    GPUPE_QUAD3_RD_LAT_FIFO_LT_16TH_FULL = 27, // Clocks the Quad3 read latency fifo is not empty, but less than 1/16th full

    GPUPE_READ_STARVED_QUAD0             = 28, // Clocks Quad0 is starved waiting for data from MC
    GPUPE_READ_STARVED_QUAD1             = 29, // Clocks Quad1 is starved waiting for data from MC
    GPUPE_READ_STARVED_QUAD2             = 30, // Clocks Quad2 is starved waiting for data from MC
    GPUPE_READ_STARVED_QUAD3             = 31, // Clocks Quad3 is starved waiting for data from MC
    GPUPE_READ_STARVED                   = 32, // Clocks any quad is starved waiting for data from MC
    GPUPE_READ_STALLED_QUAD0             = 33, // Clocks a Quad0 read is stalled off waiting for the other quads to sync up
    GPUPE_READ_STALLED_QUAD1             = 34, // Clocks a Quad1 read is stalled off waiting for the other quads to sync up
    GPUPE_READ_STALLED_QUAD2             = 35, // Clocks a Quad2 read is stalled off waiting for the other quads to sync up
    GPUPE_READ_STALLED_QUAD3             = 36, // Clocks a Quad3 read is stalled off waiting for the other quads to sync up
    GPUPE_READ_STALLED                   = 37, // Clocks a read is stalled off waiting for other quads to sync up
    GPUPE_VALID_READ_QUAD0               = 38, // Valid cache-read cycles on Quad0
    GPUPE_VALID_READ_QUAD1               = 39, // Valid cache-read cycles on Quad1
    GPUPE_VALID_READ_QUAD2               = 40, // Valid cache-read cycles on Quad2
    GPUPE_VALID_READ_QUAD3               = 41, // Valid cache-read cycles on Quad3
    GPUPE_TC_TP_STARVED_QUAD0            = 42, // Quad0 is waiting for another quad to become valid before sending to TP
    GPUPE_TC_TP_STARVED_QUAD1            = 43, // Quad1 is waiting for another quad to become valid before sending to TP
    GPUPE_TC_TP_STARVED_QUAD2            = 44, // Quad2 is waiting for another quad to become valid before sending to TP
    GPUPE_TC_TP_STARVED_QUAD3            = 45, // Quad3 is waiting for another quad to become valid before sending to TP
    GPUPE_TC_TP_STARVED                  = 46, // Some data is ready for the TP, but is stalled waiting for the rest
} GPUPERFEVENT_TCM;  // Texture Cache Memory


// Event identifiers for the TCF (Texture Cache Fetch) performance counters.

typedef enum
{
    // TPC Walker counters. The Walker makes the TPs loop over mip levels,
    // aniso steps and volume slices so that every address needed for all of
    // their samples gets generated.

    GPUPE_VALID_CYCLES = 0,            // Cycles the TPC Walker is active (walker fifo not empty and aligner fifo not full)
    GPUPE_SINGLE_PHASES = 1,           // Cycles the walker is processing plain point/bilin fetches
    GPUPE_ANISO_PHASES = 2,            // aniso, mip aniso, vol aniso, mip vol aniso
    GPUPE_MIP_PHASES = 3,              // mip, mip aniso, mip vol, mip vol aniso
    GPUPE_VOL_PHASES = 4,              // vol, mip vol, vol aniso, mip vol aniso
    GPUPE_MIP_VOL_PHASES = 5,          // Only mip vol
    GPUPE_MIP_ANISO_PHASES = 6,        // Only mip aniso
    GPUPE_VOL_ANISO_PHASES = 7,        // Only vol aniso
    GPUPE_ANISO_2_1_PHASES = 8,        // Cycles spent on 2:1 aniso
    GPUPE_ANISO_4_1_PHASES = 9,        // Cycles spent on 4:1 aniso
    GPUPE_ANISO_6_1_PHASES = 10,       // Cycles spent on 6:1 aniso
    GPUPE_ANISO_8_1_PHASES = 11,       // Cycles spent on 8:1 aniso
    GPUPE_ANISO_10_1_PHASES = 12,      // Cycles spent on 10:1 aniso
    GPUPE_ANISO_12_1_PHASES = 13,      // Cycles spent on 12:1 aniso
    GPUPE_ANISO_14_1_PHASES = 14,      // Cycles spent on 14:1 aniso
    GPUPE_ANISO_16_1_PHASES = 15,      // Cycles spent on 16:1 aniso
    GPUPE_MIP_VOL_ANISO_PHASES = 16,   // mip vol aniso

    // TPC Aligner counters:

    GPUPE_ALIGN_2_PHASES = 17,
    GPUPE_ALIGN_4_PHASES = 18,

    // TPC counters. The TPC communicates between the four TPs and sends
    // common data to the TC (mainly TCA). It is responsible for making the
    // TPs talk to the TC at the same time, even when their fetches break
    // down into different numbers of samples because of aniso/mip
    // differences.

    GPUPE_TPC_BUSY = 19,
    GPUPE_TPC_STALLED = 20,
    GPUPE_TPC_STARVED = 21,
    GPUPE_TPC_WORKING = 22,
    GPUPE_TPC_WALKER_BUSY = 23,
    GPUPE_TPC_WALKER_STALLED = 24,
    GPUPE_TPC_WALKER_WORKING = 25,
    GPUPE_TPC_ALIGNER_BUSY = 26,
    GPUPE_TPC_ALIGNER_STALLED = 27,
    GPUPE_TPC_ALIGNER_STALLED_BY_BLEND = 28,
    GPUPE_TPC_ALIGNER_STALLED_BY_CACHE = 29,
    GPUPE_TPC_ALIGNER_WORKING = 30,
    GPUPE_TPC_BLEND_BUSY = 31,
    GPUPE_TPC_BLEND_SYNC = 32,
    GPUPE_TPC_BLEND_STARVED = 33,
    GPUPE_TPC_BLEND_WORKING = 34,

    // TPC Opcode counters: incremented once per instruction per 64-vector.

    GPUPE_OPCODE_0X00 = 35,            // Number of Vfetches with UseTextureCache=true
    GPUPE_OPCODE_0X01 = 36,            // Number of Tfetches executed
    GPUPE_OPCODE_0X04 = 37,            // Unused, always 0
    GPUPE_OPCODE_0X10 = 38,            // Number of getBCF ops executed
    GPUPE_OPCODE_0X11 = 39,            // Number of getCompTexLOD ops executed
    GPUPE_OPCODE_0X12 = 40,            // Number of GetGradients ops executed
    GPUPE_OPCODE_0X13 = 41,            // Number of getWeights ops executed
    GPUPE_OPCODE_0X18 = 42,            // Number of setTexLOD ops executed
    GPUPE_OPCODE_0X19 = 43,            // Number of setGradientsH ops executed
    GPUPE_OPCODE_0X1A = 44,            // Number of setGradientsV ops executed
    GPUPE_OPCODE_OTHER = 45,           // Unused, always 0

    // Reserved selectors; these always read zero:

    GPUPE_RESERVED_46 = 46,
    GPUPE_RESERVED_47 = 47,
    GPUPE_RESERVED_48 = 48,
    GPUPE_RESERVED_49 = 49,
    GPUPE_RESERVED_50 = 50,
    GPUPE_RESERVED_51 = 51,
    GPUPE_RESERVED_52 = 52,
    GPUPE_RESERVED_53 = 53,
    GPUPE_RESERVED_54 = 54,
    GPUPE_RESERVED_55 = 55,

    // TP/TPC -> TCA fifos. The TCA takes sample requests from the TPC and
    // the 4 TPs, and breaks apart any request that cannot be sent to the
    // Memory Hub in a single cycle. The TCA is split into 4 quarters, one
    // per TP. One fifo entry holds a quad's worth of samples.

    GPUPE_IN_FIFO_0_EMPTY = 56,
    GPUPE_IN_FIFO_0_LT_HALF_FULL = 57,
    GPUPE_IN_FIFO_0_HALF_FULL = 58,
    GPUPE_IN_FIFO_0_FULL = 59,
    GPUPE_IN_FIFO_1_EMPTY = 60,
    GPUPE_IN_FIFO_1_LT_HALF_FULL = 61,
    GPUPE_IN_FIFO_1_HALF_FULL = 62,
    GPUPE_IN_FIFO_1_FULL = 63,
    GPUPE_IN_FIFO_2_EMPTY = 64,
    GPUPE_IN_FIFO_2_LT_HALF_FULL = 65,
    GPUPE_IN_FIFO_2_HALF_FULL = 66,
    GPUPE_IN_FIFO_2_FULL = 67,
    GPUPE_IN_FIFO_3_EMPTY = 68,
    GPUPE_IN_FIFO_3_LT_HALF_FULL = 69,
    GPUPE_IN_FIFO_3_HALF_FULL = 70,
    GPUPE_IN_FIFO_3_FULL = 71,
    GPUPE_IN_FIFO_TPC_EMPTY = 72,
    GPUPE_IN_FIFO_TPC_LT_HALF_FULL = 73,
    GPUPE_IN_FIFO_TPC_HALF_FULL = 74,
    GPUPE_IN_FIFO_TPC_FULL = 75,

    // Other TCA counters:

    GPUPE_TPC_TC_XFC = 76,             // TPC_TC_rts
    GPUPE_TPC_TC_STATE = 77,           // tca_state_rts
    GPUPE_TC_STALL = 78,               // Cycles in which any of the TP->TCA fifos was full
    GPUPE_QUAD0_TAPS = 79,             // u0TCA_PM_tap_valid_count
    GPUPE_QUAD1_TAPS = 80,             // u1TCA_PM_tap_valid_count
    GPUPE_QUAD2_TAPS = 81,             // u2TCA_PM_tap_valid_count
    GPUPE_QUAD3_TAPS = 82,             // u3TCA_PM_tap_valid_count
    GPUPE_QUADS = 83,                  // tca_quad_valid_count

    GPUPE_TCA_SYNC_STALL = 84,         // Cycles that 1+ TP->TCA fifos had data ready, but (not all fifos had data ready, or, some but not all of the TCA Probe Filters stalled)
    GPUPE_TAG_STALL = 85,              // Probe-filter tag stalls (sum over all 4 Probe Filter units)
    GPUPE_SLICE_STALL = 86,            // Probe-filter slice stalls (sum over all 4 Probe Filter units)
    GPUPE_SKEW_STALL = 87,             // No probe filter may get > 3 cycles ahead of another; this counts the stalls introduced to prevent that (sum over all 4 Probe Filter units)
    GPUPE_TCB_SYNC_STALL = 88,         // Cycles that 1+ TCA probe filters had data ready for TCB, but 1+ didn't have data ready

    // TCB (Tag Compare Block) Core counters:

    GPUPE_TCA_VALID = 89,              // Cycles the TCA sent data to the TCB
    GPUPE_PROBES_VALID = 90,           // Probes sent from the TCA to the TCB

    // MISS_STALL could be called the "cache thrash" counter: it is the
    // number of cycles the TCB & TCA were stalled because a cache miss
    // occurred while no cache lines were free (meaning every cache line is
    // allocated, waiting either for the MH to return data or for the TCO to
    // finish reading all data out of the cache lines).

    GPUPE_MISS_STALL = 91,
    GPUPE_FETCH_FIFO_STALL = 92,       // Cycles the TCB Fetch Fifo was full
    GPUPE_TCO_STALL = 93,              // Cycles the TCO wasn't ready for the TCB to send it data
    GPUPE_ANY_STALL = 94,              // = GPUPE_MISS_STALL + GPUPE_FETCH_FIFO_STALL + GPUPE_TCO_STALL

    // Tag Compare counters. Each tag is built from several samples entering
    // the TCB at the same time; one tag is generated per cache line.

    GPUPE_TAG_MISSES = 95,             // The requested cache line was not in the cache
    GPUPE_TAG_HITS = 96,               // The requested cache line was in the cache
    GPUPE_SUB_TAG_MISSES = 97,         // (Subset of TAG_HITS): the line was in the cache, but at least some of the data wasn't

    GPUPE_SET0_INVALIDATES = 98,
    GPUPE_SET1_INVALIDATES = 99,
    GPUPE_SET2_INVALIDATES = 100,
    GPUPE_SET3_INVALIDATES = 101,
    GPUPE_SET0_TAG_MISSES = 102,
    GPUPE_SET1_TAG_MISSES = 103,
    GPUPE_SET2_TAG_MISSES = 104,
    GPUPE_SET3_TAG_MISSES = 105,
    GPUPE_SET0_TAG_HITS = 106,
    GPUPE_SET1_TAG_HITS = 107,
    GPUPE_SET2_TAG_HITS = 108,
    GPUPE_SET3_TAG_HITS = 109,
    GPUPE_SET0_SUB_TAG_MISSES = 110,
    GPUPE_SET1_SUB_TAG_MISSES = 111,
    GPUPE_SET2_SUB_TAG_MISSES = 112,
    GPUPE_SET3_SUB_TAG_MISSES = 113,

    GPUPE_SET0_EVICT1 = 114,
    GPUPE_SET0_EVICT2 = 115,
    GPUPE_SET0_EVICT3 = 116,
    GPUPE_SET0_EVICT4 = 117,
    GPUPE_SET0_EVICT5 = 118,
    GPUPE_SET0_EVICT6 = 119,
    GPUPE_SET0_EVICT7 = 120,
    GPUPE_SET0_EVICT8 = 121,
    GPUPE_SET0_EVICT9 = 122,
    GPUPE_SET0_EVICT10 = 123,
    GPUPE_SET0_EVICT11 = 124,
    GPUPE_SET0_EVICT12 = 125,
    GPUPE_SET0_EVICT13 = 126,
    GPUPE_SET0_EVICT14 = 127,
    GPUPE_SET0_EVICT15 = 128,
    GPUPE_SET0_EVICT16 = 129,
    GPUPE_SET1_EVICT1 = 130,
    GPUPE_SET1_EVICT2 = 131,
    GPUPE_SET1_EVICT3 = 132,
    GPUPE_SET1_EVICT4 = 133,
    GPUPE_SET1_EVICT5 = 134,
    GPUPE_SET1_EVICT6 = 135,
    GPUPE_SET1_EVICT7 = 136,
    GPUPE_SET1_EVICT8 = 137,
    GPUPE_SET1_EVICT9 = 138,
    GPUPE_SET1_EVICT10 = 139,
    GPUPE_SET1_EVICT11 = 140,
    GPUPE_SET1_EVICT12 = 141,
    GPUPE_SET1_EVICT13 = 142,
    GPUPE_SET1_EVICT14 = 143,
    GPUPE_SET1_EVICT15 = 144,
    GPUPE_SET1_EVICT16 = 145,
    GPUPE_SET2_EVICT1 = 146,
    GPUPE_SET2_EVICT2 = 147,
    GPUPE_SET2_EVICT3 = 148,
    GPUPE_SET2_EVICT4 = 149,
    GPUPE_SET2_EVICT5 = 150,
    GPUPE_SET2_EVICT6 = 151,
    GPUPE_SET2_EVICT7 = 152,
    GPUPE_SET2_EVICT8 = 153,
    GPUPE_SET2_EVICT9 = 154,
    GPUPE_SET2_EVICT10 = 155,
    GPUPE_SET2_EVICT11 = 156,
    GPUPE_SET2_EVICT12 = 157,
    GPUPE_SET2_EVICT13 = 158,
    GPUPE_SET2_EVICT14 = 159,
    GPUPE_SET2_EVICT15 = 160,
    GPUPE_SET2_EVICT16 = 161,
    GPUPE_SET3_EVICT1 = 162,
    GPUPE_SET3_EVICT2 = 163,
    GPUPE_SET3_EVICT3 = 164,
    GPUPE_SET3_EVICT4 = 165,
    GPUPE_SET3_EVICT5 = 166,
    GPUPE_SET3_EVICT6 = 167,
    GPUPE_SET3_EVICT7 = 168,
    GPUPE_SET3_EVICT8 = 169,
    GPUPE_SET3_EVICT9 = 170,
    GPUPE_SET3_EVICT10 = 171,
    GPUPE_SET3_EVICT11 = 172,
    GPUPE_SET3_EVICT12 = 173,
    GPUPE_SET3_EVICT13 = 174,
    GPUPE_SET3_EVICT14 = 175,
    GPUPE_SET3_EVICT15 = 176,
    GPUPE_SET3_EVICT16 = 177,

    // TCB fetch fifo (8-deep)

    GPUPE_FF_EMPTY = 178,
    GPUPE_FF_LT_HALF_FULL = 179,
    GPUPE_FF_HALF_FULL = 180,
    GPUPE_FF_FULL = 181,

    GPUPE_FF_XFC = 182,
    GPUPE_FF_STALLED = 183,            // TCB_PM_fetch_fifo_stalled

    // TCB fetch generator

    GPUPE_FG_MASKS = 184,
    GPUPE_FG_LEFT_MASKS = 185,
    GPUPE_FG_LEFT_MASK_STALLED = 186,
    GPUPE_FG_LEFT_NOT_DONE_STALL = 187,
    GPUPE_FG_LEFT_FG_STALL = 188,
    GPUPE_FG_LEFT_SECTORS = 189,
    GPUPE_FG_RIGHT_MASKS = 190,
    GPUPE_FG_RIGHT_MASK_STALLED = 191,
    GPUPE_FG_RIGHT_NOT_DONE_STALL = 192,
    GPUPE_FG_RIGHT_FG_STALL = 193,
    GPUPE_FG_RIGHT_SECTORS = 194,
    GPUPE_FG0_REQUESTS = 195,
    GPUPE_FG0_STALLED = 196,
    GPUPE_FG1_REQUESTS = 197,
    GPUPE_FG1_STALLED = 198,

    // TCB MH interface

    GPUPE_MEM_REQ512 = 199,
    GPUPE_MEM_REQ_SENT = 200,
    GPUPE_MEM_AGP_READ_REQ = 201,
    GPUPE_MEM_LOCAL_READ_REQ = 202,
    GPUPE_TC0_MH_STALLED = 203,
    GPUPE_TC1_MH_STALLED = 204,
} GPUPERFEVENT_TCF;  // Texture Cache Fetch

// Event identifiers for the TP performance counters.

typedef enum
{
    GPUPE_POINT_QUADS         = 0,  // Point-sampled quads of any kind (includes mip, aniso, volume)
    GPUPE_BILIN_QUADS         = 1,  // Bilinearly filtered quads of any kind (includes mip, aniso, volume)
    GPUPE_ANISO_QUADS         = 2,  // Aniso (>1:1) filtered quads of any kind (includes mip and/or vol)
    GPUPE_MIP_QUADS           = 3,  // Mip filtered quads of any kind (includes aniso and/or vol)
    GPUPE_VOL_QUADS           = 4,  // Volume filtered quads of any kind (includes mip and/or aniso)
    GPUPE_MIP_VOL_QUADS       = 5,  // Quads filtered with mip and volume (not aniso)
    GPUPE_MIP_ANISO_QUADS     = 6,  // Quads filtered with mip and aniso (>1:1) (not vol)
    GPUPE_VOL_ANISO_QUADS     = 7,  // Quads filtered with volume and aniso (>1:1) (not mip)
    GPUPE_ANISO_2_1_QUADS     = 8,  // Quads using 2:1 anisotropic filtering
    GPUPE_ANISO_4_1_QUADS     = 9,  // Quads using 4:1 anisotropic filtering
    GPUPE_ANISO_6_1_QUADS     = 10, // Quads using 6:1 anisotropic filtering
    GPUPE_ANISO_8_1_QUADS     = 11, // Quads using 8:1 anisotropic filtering
    GPUPE_ANISO_10_1_QUADS    = 12, // Quads using 10:1 anisotropic filtering
    GPUPE_ANISO_12_1_QUADS    = 13, // Quads using 12:1 anisotropic filtering
    GPUPE_ANISO_14_1_QUADS    = 14, // Quads using 14:1 anisotropic filtering
    GPUPE_ANISO_16_1_QUADS    = 15, // Quads using 16:1 anisotropic filtering
    GPUPE_MIP_VOL_ANISO_QUADS = 16, // Quads filtered with mip, volume and aniso (>1:1)
    GPUPE_ALIGN_2_QUADS       = 17, // Quads misaligned by 2 cycles
    GPUPE_ALIGN_4_QUADS       = 18, // Quads misaligned by 4 cycles
    GPUPE_PIX_0_QUAD          = 19, // Quad has no valid pixels
    GPUPE_PIX_1_QUAD          = 20, // Quad has 1 valid pixel
    GPUPE_PIX_2_QUAD          = 21, // Quad has 2 valid pixels
    GPUPE_PIX_3_QUAD          = 22, // Quad has 3 valid pixels
    GPUPE_PIX_4_QUAD          = 23, // Quad has 4 valid pixels
    GPUPE_TP_MIPMAP_LOD0      = 24, // Mipmap LOD 0
    GPUPE_TP_MIPMAP_LOD1      = 25, // Mipmap LOD 1
    GPUPE_TP_MIPMAP_LOD2      = 26, // Mipmap LOD 2
    GPUPE_TP_MIPMAP_LOD3      = 27, // Mipmap LOD 3
    GPUPE_TP_MIPMAP_LOD4      = 28, // Mipmap LOD 4
    GPUPE_TP_MIPMAP_LOD5      = 29, // Mipmap LOD 5
    GPUPE_TP_MIPMAP_LOD6      = 30, // Mipmap LOD 6
    GPUPE_TP_MIPMAP_LOD7      = 31, // Mipmap LOD 7
    GPUPE_TP_MIPMAP_LOD8      = 32, // Mipmap LOD 8
    GPUPE_TP_MIPMAP_LOD9      = 33, // Mipmap LOD 9
    GPUPE_TP_MIPMAP_LOD10     = 34, // Mipmap LOD 10
    GPUPE_TP_MIPMAP_LOD11     = 35, // Mipmap LOD 11
    GPUPE_TP_MIPMAP_LOD12     = 36, // Mipmap LOD 12
    GPUPE_TP_MIPMAP_LOD13     = 37, // Mipmap LOD 13
    GPUPE_TP_MIPMAP_LOD14     = 38, // Mipmap LOD 14
} GPUPERFEVENT_TP;

// Event identifiers for the SX performance counters.

typedef enum
{
    GPUPE_SX_SC_QUADS          = 0, // Quads sent by the SC
    GPUPE_SX_SC_QUAD_FIFO_FULL = 1, // Cycles during which the SC quad FIFO is full
    GPUPE_SX_EXPORT_VECTORS    = 2, // Exported vectors
    GPUPE_SX_DUMMY_QUADS       = 3, // Dummy quads
    GPUPE_SX_ALPHA_FAIL        = 4, // Pixels failing the alpha test
    GPUPE_SX_RB_QUAD_BUSY      = 5, // SX is sending quads to the RBs
    GPUPE_SX_RB_COLOR_BUSY     = 6, // SX is sending colors to the RBs
    GPUPE_SX_RB_QUAD_STALL     = 7, // SX is idle on the quad interface
    GPUPE_SX_RB_COLOR_STALL    = 8, // SX is idle on the color interface
} GPUPERFEVENT_SX;

// Event identifiers for the BC performance counters.

typedef enum
{
    GPUPE_BC_CNTX0_BUSY       = 0,  // Cycles the BC is busy processing data (bc_context0_busy)
    GPUPE_BC_CNTX17_BUSY      = 1,  // Cycles the BC is busy processing data (bc_context17_busy)
    GPUPE_BC_RQ_STALLED       = 2,  // Cycles the BC is stalled because the reorder queue has no available banks to select
    GPUPE_BC_AZ_STALLED       = 3,  // Cycles the BC is stalled by the AZ
    GPUPE_BC_MH_CPY_STALLED   = 4,  // Cycles the BC is stalled by the MH for copy/resolve
    GPUPE_BC_MH_EXP_STALLED   = 5,  // Cycles the BC is stalled by the MH for memory exports
    GPUPE_BC_SC_STARVED       = 6,  // Cycles the SC->BC quad fifo is empty while the BC has a tile from HZ to work on
    GPUPE_BC_SX_STARVED       = 7,  // Cycles the SX->BC quad fifo is empty while in color mode and the BC has a tile and quads from HZ and SC
    GPUPE_BC_ACC_COUNT        = 8,  // Times multiple fragments are combined into a quad (acc opcodes)
    GPUPE_BC_DRAW_COUNT       = 9,  // Quads sent to the AZ (draw opcodes)
    GPUPE_BC_ACC2_COUNT       = 10, // Accumulate-two opcodes for 64bpp and MRTs
    GPUPE_BC_DRAW2_COUNT      = 11, // Draw-two opcodes for 64bpp and MRTs
    GPUPE_BC_SETZ_COUNT       = 12, // Depth exports
    GPUPE_BC_READ_COUNT       = 13, // Read opcodes when resolving
    GPUPE_BC_READ_ACC_COUNT   = 14, // read_acc opcodes when resolving
    GPUPE_BC_STATE_COUNT      = 15, // State opcodes sent to the AZ
    GPUPE_BC_STATE2_COUNT     = 16, // state2 opcodes sent to the AZ
    GPUPE_BC_COPY_WRITE_COUNT = 17, // 256-bit system memory writes for EDRAM copy/resolve
    GPUPE_BC_EXPORT_COUNT     = 18, // Memory exports from the SX
} GPUPERFEVENT_BC;

// Event identifiers for the MC performance counters; these are the values
// written to the GPUPERFREG_MC*_PERFCOUNTER0_SELECT registers.

typedef enum
{
    GPUPE_RANK_BANK0_ACCESSES          = 0,  // Rank Bank 0 access event
    GPUPE_RANK_BANK1_ACCESSES          = 1,  // Rank Bank 1 access event
    GPUPE_RANK_BANK2_ACCESSES          = 2,  // Rank Bank 2 access event
    GPUPE_RANK_BANK3_ACCESSES          = 3,  // Rank Bank 3 access event
    GPUPE_RANK_BANK4_ACCESSES          = 4,  // Rank Bank 4 access event
    GPUPE_RANK_BANK5_ACCESSES          = 5,  // Rank Bank 5 access event
    GPUPE_RANK_BANK6_ACCESSES          = 6,  // Rank Bank 6 access event
    GPUPE_RANK_BANK7_ACCESSES          = 7,  // Rank Bank 7 access event
    GPUPE_RANK_BANK8_ACCESSES          = 8,  // Rank Bank 8 access event
    GPUPE_RANK_BANK9_ACCESSES          = 9,  // Rank Bank 9 access event
    GPUPE_RANK_BANK10_ACCESSES         = 10, // Rank Bank 10 access event
    GPUPE_RANK_BANK11_ACCESSES         = 11, // Rank Bank 11 access event
    GPUPE_RANK_BANK12_ACCESSES         = 12, // Rank Bank 12 access event
    GPUPE_RANK_BANK13_ACCESSES         = 13, // Rank Bank 13 access event
    GPUPE_RANK_BANK14_ACCESSES         = 14, // Rank Bank 14 access event
    GPUPE_RANK_BANK15_ACCESSES         = 15, // Rank Bank 15 access event
    GPUPE_READ_2_WRITE                 = 16, // Read-to-write transition event
    GPUPE_WRITE_2_READ                 = 17, // Write-to-read transition event
    GPUPE_NEW_PAGE_ACCESSES            = 18, // New-page accesses out of the ordering engine
    GPUPE_TOTAL_ACCESSES               = 19, // Total accesses out of the ordering engine
    GPUPE_READ_ACCESSES                = 20, // Reads out of the ordering engine
    GPUPE_ACCESS_PRESENT_NO_ISSUE_CLKS = 21, // Clocks in which an access is present but the ordering engine doesn't issue
    GPUPE_CMD_PRESENT_NO_XFER_CLKS     = 22, // Non-transfer clocks on the DRAM data bus while commands are present
    GPUPE_URGENT_DC_ACCESSES           = 23, // Urgent accesses from the DC queue
    GPUPE_URGENT_SB_ACCESSES           = 24, // Urgent accesses from the SB queue
    GPUPE_URGENT_BIU_ACCESSES          = 25, // Urgent accesses from the BIUS (slow) queue
    GPUPE_NEW_BIUF_ACCESSES            = 26, // New accesses from the BIUF (fast) queue
    GPUPE_NEW_CP_ACCESSES              = 27, // New accesses from the CP queue
    GPUPE_NEW_TC_ACCESSES              = 28, // New accesses from the TC queue
    GPUPE_NEW_VC_ACCESSES              = 29, // New accesses from the VC queue
    GPUPE_NEW_BC_CP_ACCESSES           = 30, // New accesses from the BC_CP queue
    GPUPE_NEW_BC_EX_ACCESSES           = 31, // New accesses from the BC_EX queue
    GPUPE_NEW_VGT_ACCESSES             = 32, // New accesses from the VGT queue
    GPUPE_NEW_DC_ACCESSES              = 33, // New accesses from the DC queue
    GPUPE_NEW_SB_ACCESSES              = 34, // New accesses from the SB queue
    GPUPE_NEW_BIUS_ACCESSES            = 35, // New accesses from the BIUS (slow) queue
    GPUPE_BIUS_READ_ACCESSES           = 36, // Read accesses from the BIUS (slow) queue
    GPUPE_SB_READ_ACCESSES             = 37, // Read accesses from the SB queue
    GPUPE_CP_READ_ACCESSES             = 38, // Read accesses from the CP queue

    // The counters below take a parameter "N". When writing
    // GPUPERFREG_MC*_PERFCOUNTER0_SELECT, "N" is the 2nd lowest byte of the
    // value written to the _SELECT register. For example, to read MC0's
    // GPUPE_NTH_SMPG_ACCESS_IS_TC counter with N=1, write
    // (GPUPE_NTH_SMPG_ACCESS_IS_TC | (0x01 << 8)) as the selection for
    // GPUPERFREG_MC0_PERFCOUNTER0_SELECT.

    GPUPE_NTH_SMPG_ACCESS_IS_BIUF           = 39, // Times the Nth access in a same-page sequence is from the BIUF (fast) queue
    GPUPE_NTH_SMPG_ACCESS_IS_CP             = 40, // Times the Nth access in a same-page sequence is from the CP queue
    GPUPE_NTH_SMPG_ACCESS_IS_TC             = 41, // Times the Nth access in a same-page sequence is from the TC queue
    GPUPE_NTH_SMPG_ACCESS_IS_VC             = 42, // Times the Nth access in a same-page sequence is from the VC queue
    GPUPE_NTH_SMPG_ACCESS_IS_BC_CP          = 43, // Times the Nth access in a same-page sequence is from the BC_CP queue
    GPUPE_NTH_SMPG_ACCESS_IS_BC_EX          = 44, // Times the Nth access in a same-page sequence is from the BC_EX queue
    GPUPE_NTH_SMPG_ACCESS_IS_VGT            = 45, // Times the Nth access in a same-page sequence is from the VGT queue
    GPUPE_NTH_SMPG_ACCESS_IS_DC             = 46, // Times the Nth access in a same-page sequence is from the DC queue
    GPUPE_NTH_SMPG_ACCESS_IS_SB             = 47, // Times the Nth access in a same-page sequence is from the SB queue
    GPUPE_NTH_SMPG_ACCESS_IS_BIUS           = 48, // Times the Nth access in a same-page sequence is from the BIUS (slow) queue
    GPUPE_N_VALID_ENTRY_IN_TEXTURE_CAM_CLKS = 49, // Clocks during which the texture cam holds N valid entries
    GPUPE_N_VALID_ENTRY_IN_VC_CAM_CLKS      = 50, // Clocks during which the VC cam holds N valid entries
    GPUPE_N_VALID_ENTRY_IN_BIUS_CAM_CLKS    = 51, // Clocks during which the BIUS cam holds N valid entries
    GPUPE_N_SB_BUF_USED_CLKS                = 52, // Clocks during which q_rdbuf_sb_buffers_used = N
    GPUPE_N_TC_BUF_USED_CLKS                = 53, // Clocks during which q_rdbuf_tc_buffers_used = N
    GPUPE_N_VC_BUF_USED_CLKS                = 54, // Clocks during which q_rdbuf_vc_buffers_used = N
    GPUPE_N_DC_BUF_USED_CLKS                = 55, // Clocks during which q_rdbuf_dc_buffers_used = N
    GPUPE_N_VGT_BUF_USED_CLK                = 56, // Clocks during which q_rdbuf_vgt_buffers_used = N
    GPUPE_N_CP_BUF_USED_CLKS                = 57, // Clocks during which q_rdbuf_cp_buffers_used = N
} GPUPERFEVENT_MC;

// Event identifiers for the MH performance counters; these are the values
// selected through the GPUPERFREG_MH_PERFCOUNTER*_SELECT registers.

typedef enum
{
    GPUPE_CP_READ_MEMORY = 0,         // Counts cp read requests that map to the main memory aperture
    GPUPE_CP_READ_PGLB = 1,           // Counts cp read requests that map to the pg line buffer aperture
    GPUPE_CP_WRITE_MEMORY = 2,        // Counts cp write requests that map to the main memory aperture
    GPUPE_CP_WRITE_SNOOPED = 3,       // Counts cp write requests that are marked as snooped
    GPUPE_CP_WRITE_WRITEBACK = 4,     // Counts cp write requests that map to the writeback aperture
    GPUPE_MH_CP_RTR = 5,              // rtr from mh to cp; counts cycles where it is asserted (able to accept requests)
    GPUPE_VGT_READ_MEMORY = 6,        // Counts vgt read requests that map to the main memory aperture
    GPUPE_VGT_READ_PGLB = 7,          // Counts vgt read requests that map to the pg line buffer aperture
    GPUPE_MH_VGT_RTR = 8,             // rtr from mh to vgt; counts cycles where it is asserted (able to accept requests)
    GPUPE_IOC_READ = 9,               // Counts ioc read requests
    GPUPE_IOC_WRITE = 10,             // Counts ioc write requests
    GPUPE_IOC_READ_BYTE_COUNT = 11,   // Counts the number of bytes in ioc read requests
    GPUPE_IOC_WRITE_BYTE_COUNT = 12,  // Counts the number of bytes in ioc write requests
    GPUPE_IOC_URGENT = 13,            // Counts clock cycles in which ioc asserts urgent
    GPUPE_MH_IOC_RTR = 14,            // Counts clock cycles where the mh is ready to receive requests from the ioc
    GPUPE_TC0_READ = 15,              // Counts read requests
    GPUPE_MH_TC0_RTR = 16,            // Counts cycles where mh is ready
    GPUPE_TC1_READ = 17,              // Counts read requests
    GPUPE_MH_TC1_RTR = 18,            // Counts cycles where mh is ready
    GPUPE_VC0_READ_MEMORY = 19,       // Counts read requests to memory
    GPUPE_VC0_READ_PGLB = 20,         // Counts read requests to pglb
    GPUPE_MH_VC0_RTR = 21,            // Counts cycles where mh is ready
    GPUPE_VC1_READ_MEMORY = 22,       // Counts read requests to memory
    GPUPE_VC1_READ_PGLB = 23,         // Counts read requests to pglb
    GPUPE_MH_VC1_RTR = 24,            // Counts cycles where mh is ready
    GPUPE_BC0_CP_WRITE = 25,          // Counts copy write requests
    GPUPE_BC0_EX_WRITE = 26,          // Counts export write requests
    GPUPE_MH_BC0_RTR = 27,
    GPUPE_BC1_CP_WRITE = 28,          // Counts copy write requests
    GPUPE_BC1_EX_WRITE = 29,          // Counts export write requests
    GPUPE_MH_BC1_RTR = 30,
    GPUPE_DC_GRAPHICS_REQ = 31,       // Counts graphics read requests
    GPUPE_DC_OVERLAY_REQ = 32,        // Counts overlay read requests
    GPUPE_DC_URGENT = 33,             // Counts cycles urgent is asserted
    GPUPE_PGLB_BIU_REQ = 34,          // Counts requests from the pglb to the biu
    GPUPE_BIU_PGL_READ_DATA = 35,     // Counts cycles where read data is transferred from biu to pglb
    GPUPE_PGL_MHS_READ_DATA = 36,     // pglb to mh switch data transfer cycles
    GPUPE_MH_MC0_READ_REQS = 37,      // Counts the number of read requests sent to mc0
    GPUPE_MH_MC0_WRITE_REQS = 38,     // Counts the number of write requests sent to mc0
    GPUPE_MH_MC1_READ_REQS = 39,      // Counts the number of read requests sent to mc1
    GPUPE_MH_MC1_WRITE_REQS = 40,     // Counts the number of write requests sent to mc1
    GPUPE_MC0_MH_READ_DATA = 41,      // Counts bytes returned to mh from mc0, in increments of 32 bytes
    GPUPE_MC1_MH_READ_DATA = 42,      // Counts bytes returned to mh from mc1, in increments of 32 bytes
    GPUPE_MH_CP_SEND = 43,            // Counts bytes returned to cp, in increments of 4 bytes
    GPUPE_MH_VGT_SEND = 44,           // Counts bytes returned to vgt, in increments of 16 bytes
    GPUPE_MH_IOC_SEND = 45,           // Counts bytes returned to ioc, in increments of 4 bytes
    GPUPE_MH_TC0_SEND = 46,           // Counts bytes returned to tc0, in increments of 16 bytes
    GPUPE_MH_TC1_SEND = 47,           // Counts bytes returned to tc1, in increments of 16 bytes
    GPUPE_MH_VC0_SEND = 48,           // Counts bytes returned to vc0, in increments of 16 bytes
    GPUPE_MH_VC1_SEND = 49,           // Counts bytes returned to vc1, in increments of 16 bytes
    GPUPE_MH_DC_SEND = 50,            // Counts bytes returned to dc, in increments of 16 bytes
    GPUPE_DC0_INFLIGHT_FULL = 51,     // Counts cycles where the inflight (or outstanding read request) queue was full
    GPUPE_DC1_INFLIGHT_FULL = 52,
    GPUPE_VC0_INFLIGHT_FULL = 53,
    GPUPE_VC1_INFLIGHT_FULL = 54,
    GPUPE_TC0_INFLIGHT_FULL = 55,
    GPUPE_TC1_INFLIGHT_FULL = 56,
    GPUPE_CP0_INFLIGHT_FULL = 57,
    GPUPE_CP1_INFLIGHT_FULL = 58,
    GPUPE_VGT0_INFLIGHT_FULL = 59,
    GPUPE_VGT1_INFLIGHT_FULL = 60,
    GPUPE_SB0_INFLIGHT_FULL = 61,
    GPUPE_SB1_INFLIGHT_FULL = 62,
    GPUPE_VCPGL_INFLIGHT_FULL = 63,
    GPUPE_CPPGL_INFLIGHT_FULL = 64,
    GPUPE_VGTPGL_INFLIGHT_FULL = 65,
    GPUPE_MC0_DC_Q_FULL = 66,         // Counts cycles where the request queue in the mc was full
    GPUPE_MC0_VC_Q_FULL = 67,
    GPUPE_MC0_TC_Q_FULL = 68,
    GPUPE_MC0_CP_Q_FULL = 69,
    GPUPE_MC0_SB_Q_FULL = 70,
    GPUPE_MC0_VGT_Q_FULL = 71,
    GPUPE_MC0_BCCP_Q_FULL = 72,
    GPUPE_MC0_BCEX_Q_FULL = 73,
    GPUPE_MC1_DC_Q_FULL = 74,
    GPUPE_MC1_VC_Q_FULL = 75,
    GPUPE_MC1_TC_Q_FULL = 76,
    GPUPE_MC1_CP_Q_FULL = 77,
    GPUPE_MC1_SB_Q_FULL = 78,
    GPUPE_MC1_VGT_Q_FULL = 79,
    GPUPE_MC1_BCCP_Q_FULL = 80,
    GPUPE_MC1_BCEX_Q_FULL = 81,
    GPUPE_CP_PGL_CACHE_HIT = 82,      // Counts pg line buffer cache hits
    GPUPE_VGT_PGL_CACHE_HIT = 83,
    GPUPE_VC_PGL_CACHE_HIT = 84,
    GPUPE_CP_PGL_FULL = 85,           // Analogous to the request queue in the MCs - for pg reads
    GPUPE_VGT_PGL_FULL = 86,
    GPUPE_VC0_PGL_FULL = 87,
    GPUPE_VC1_PGL_FULL = 88,
    GPUPE_MC0_WDB_FULL = 89,          // Counts cycles where the write data buffer for mc0 was full
    GPUPE_MC0_TAGBUF_FULL = 90,       // Counts cycles where the tag buffer for mc0 was full
    GPUPE_MC1_WDB_FULL = 91,
    GPUPE_MC1_TAGBUF_FULL = 92,
    GPUPE_PGL_TAGBUF_FULL = 93,       // Counts cycles where the tag buffer for pgl was full
    GPUPE_CP_WRITENOTIFY = 94,        // Counts write clean indications sent back to cp
    GPUPE_BC_WRITENOTIFY = 95,        // Counts write cleans sent back to bc
    GPUPE_IOC_SYNC = 96,              // Counts write cleans sent back to ioc

    // Latency counters increment by the number of pending requests of the
    // given type. (Their derivative is incremented when a request is sent
    // and decremented when the data is actually retrieved.) To obtain the
    // average latency, divide the counter by the number of requests of the
    // specified type.

    // The MH PGL latency counters can only be accessed through GPUPERFREG_MH_PERFCOUNTER2_SELECT

    GPUPE_PGL_BIU_LATENCY = 97,

    // The MH MC0 counters can only be accessed using GPUPERFREG_MH_PERFCOUNTER0_SELECT

    GPUPE_MH_MC0_LATENCY = 98,
    GPUPE_MH_MC0_DC_LATENCY = 99,
    GPUPE_MH_MC0_VC_LATENCY = 100,
    GPUPE_MH_MC0_TC_LATENCY = 101,
    GPUPE_MH_MC0_CP_LATENCY = 102,
    GPUPE_MH_MC0_SB_LATENCY = 103,
    GPUPE_MH_MC0_VGT_LATENCY = 104,

    // The MH MC1 counters can only be accessed using GPUPERFREG_MH_PERFCOUNTER1_SELECT

    GPUPE_MH_MC1_LATENCY = 105,
    GPUPE_MH_MC1_DC_LATENCY = 106,
    GPUPE_MH_MC1_VC_LATENCY = 107,
    GPUPE_MH_MC1_TC_LATENCY = 108,
    GPUPE_MH_MC1_CP_LATENCY = 109,
    GPUPE_MH_MC1_SB_LATENCY = 110,
    GPUPE_MH_MC1_VGT_LATENCY = 111,

    // The MH PGL latency counters can only be accessed through GPUPERFREG_MH_PERFCOUNTER2_SELECT

    GPUPE_MH_PGL_LATENCY = 112,
    GPUPE_MH_PGL_CP_LATENCY = 113,
    GPUPE_MH_PGL_VC_LATENCY = 114,
    GPUPE_MH_PGL_VGT_LATENCY = 115,

    GPUPE_TC0_RDY_AND_NOROOM = 116,   // Counts cycles where read data was available to send to tc0 but there wasn't room in tc0 to accept it
    GPUPE_TC1_RDY_AND_NOROOM = 117,
    GPUPE_IOC_RDY_AND_NOROOM = 118,
} GPUPERFEVENT_MH;

// GPUPERFEVENT_BIF: numbered event codes.  The enumerator names carry no
// meaning of their own, so the remark beside each value is the only
// description of what that code counts.
typedef enum
{
    GPUPE_0 = 0,    // Always count
    GPUPE_1 = 1,    // RBBM_IF FIFO full
    GPUPE_2 = 2,    // MIOC FIFO full
    GPUPE_3 = 3,    // MIOC has a transaction and is waiting for the RBBM to be ready
    GPUPE_4 = 4,    // SIOC FIFO full
    GPUPE_5 = 5,    // SIOC has a transaction and is waiting for the IOC to be ready
} GPUPERFEVENT_BIF;

// DCPERFEVENT: event codes, grouped below by the prefix that follows
// "GPUPE_".  Codes 0-66 are contiguous; GPUPE_LOGIC1_EVENT stands apart
// at 256.
typedef enum
{
    // DMIF_PER_* (0-27)
    GPUPE_DMIF_PER_DCREQ_EVENT                   = 0,
    GPUPE_DMIF_PER_DCGRPH_REQ_EVENT              = 1,
    GPUPE_DMIF_PER_DCOVL_REQ_EVENT               = 2,
    GPUPE_DMIF_PER_DCREQ_SIZE_EVENT              = 3,
    GPUPE_DMIF_PER_DCGRPH_REQ_SIZE_EVENT         = 4,
    GPUPE_DMIF_PER_DCOVL_REQ_SIZE_EVENT          = 5,
    GPUPE_DMIF_PER_DCSURFACE_UPDATE_EVENT        = 6,
    GPUPE_DMIF_PER_DC_MH_REQ_EVENT               = 7,
    GPUPE_DMIF_PER_DC_MH_D1GRPH_REQ_EVENT        = 8,
    GPUPE_DMIF_PER_DC_MH_D1OVL_REQ_EVENT         = 9,
    GPUPE_DMIF_PER_DC_MH_REQ_SIZE_EVENT          = 10,
    GPUPE_DMIF_PER_DC_MH_D1GRPH_REQ_SIZE_EVENT   = 11,
    GPUPE_DMIF_PER_DC_MH_D1OVL_REQ_SIZE_EVENT    = 12,
    GPUPE_DMIF_PER_DC_MH_SURFACE_UPDATE_EVENT    = 13,
    GPUPE_DMIF_PER_MH_DC_RTR_EVENT               = 14,
    GPUPE_DMIF_PER_CMD_PROC_WAIT_RTR_STATE_EVENT = 15,
    GPUPE_DMIF_PER_CMD_PROC_IDLE_STATE_EVENT     = 16,
    GPUPE_DMIF_PER_DC_MH_URGENT_EVENT            = 17,
    GPUPE_DMIF_PER_MH_DC_SEND_EVENT              = 18,
    GPUPE_DMIF_PER_MH_DC_SEND_D1GRPH_EVENT       = 19,
    GPUPE_DMIF_PER_MH_DC_SEND_D1OVL_EVENT        = 20,
    GPUPE_DMIF_PER_DC_MH_RTR_EVENT               = 21,
    GPUPE_DMIF_PER_DMIF_BUSY_EVENT               = 22,
    GPUPE_DMIF_PER_DMIF_BUSY_MH_DC_SEND_EVENT    = 23,
    GPUPE_DMIF_PER_DMIF_BUSY_DC_MH_RTR_EVENT     = 24,
    GPUPE_DMIF_PER_DMIF_DCSEND_EVENT             = 25,
    GPUPE_DMIF_PER_DMIF_DCNOT_RTS_EVENT          = 26,
    GPUPE_DMIF_PER_DCDMIF_NOT_RTR_EVENT          = 27,

    // DCP_PER_* (28-30)
    GPUPE_DCP_PER_LUT_HOST_RW_EVENT            = 28,
    GPUPE_DCP_PER_LUT_RW_BY_HOST_EVENT         = 29,
    GPUPE_DCP_PER_RTR_LOW_BY_LUT_HOST_RW_EVENT = 30,

    // DCCG_PER_* (31-37)
    GPUPE_DCCG_PER_SCLK_R_RBBMIF_CLOCK_ON_EVENT = 31,
    GPUPE_DCCG_PER_SCLK_R_DISCLOCK_ON_EVENT     = 32,
    GPUPE_DCCG_PER_SCLK_G_SCL_CLOCK_ON_EVENT    = 33,
    GPUPE_DCCG_PER_SCLK_G_DCCLOCK_ON_EVENT      = 34,
    GPUPE_DCCG_PER_PCLK_CRTC_CLOCK_ON_EVENT     = 35,
    GPUPE_DCCG_PER_DVOACLK_C_CLOCK_ON_EVENT     = 36,
    GPUPE_DCCG_PER_DVOACLK_D_CLOCK_ON_EVENT     = 37,

    // CRTC1_PER_* (38-48)
    GPUPE_CRTC1_PER_START_LINE_EVENT       = 38,
    GPUPE_CRTC1_PER_HSYNC_A_EVENT          = 39,
    GPUPE_CRTC1_PER_VSYNC_A_EVENT          = 40,
    GPUPE_CRTC1_PER_H_DATA_ACTIVE_EVENT    = 41,
    GPUPE_CRTC1_PER_V_DATA_ACTIVE_EVENT    = 42,
    GPUPE_CRTC1_PER_DATA_ACTIVE_EVENT      = 43,
    GPUPE_CRTC1_PER_H_BLANK_EVENT          = 44,
    GPUPE_CRTC1_PER_V_BLANK_EVENT          = 45,
    GPUPE_CRTC1_PER_BLANK_EVENT            = 46,
    GPUPE_CRTC1_PER_INTERLACE_SELECT_EVENT = 47,
    GPUPE_CRTC1_PER_STEREO_SELECT_EVENT    = 48,

    // SCL1_PER_* (49-55)
    GPUPE_SCL1_PER_HOST_CONFLICT_EVENT      = 49,
    GPUPE_SCL1_PER_ADVANCE_FILTER_POS_EVENT = 50,
    GPUPE_SCL1_PER_TAINC_EVENT              = 51,
    GPUPE_SCL1_PER_REQUEST_EOL_EVENT        = 52,
    GPUPE_SCL1_PER_V_COEF_PRELOAD_EVENT     = 53,
    GPUPE_SCL1_PER_EOL_EVENT                = 54,
    GPUPE_SCL1_PER_SOF_EVENT                = 55,

    // LB_PER_DISP1_* (56-62)
    GPUPE_LB_PER_DISP1_RESET_REQ_EVENT      = 56,
    GPUPE_LB_PER_DISP1_REQ_SEND_EVENT       = 57,
    GPUPE_LB_PER_DISP1_REQ_UNDERFLOW_EVENT  = 58,
    GPUPE_LB_PER_DISP1_DATA_UNDERFLOW_EVENT = 59,
    GPUPE_LB_PER_DISP1_URGENT_EVENT         = 60,
    GPUPE_LB_PER_DISP1_VBLANK_STAT_EVENT    = 61,
    GPUPE_LB_PER_DISP1_VLINE_STAT_EVENT     = 62,

    // DOUT_PER_* (63-66)
    GPUPE_DOUT_PER_SCL_DISP1_MODE_CHANGE_INTERRUPT_EVENT = 63,
    GPUPE_DOUT_PER_LB_D1_VLINE_INTERRUPT_EVENT           = 64,
    GPUPE_DOUT_PER_LB_D1_VBLANK_INTERRUPT_EVENT          = 65,
    GPUPE_DOUT_PER_DISTIMER_INTERRUPT_EVENT              = 66,

    // Not contiguous with the codes above.
    GPUPE_LOGIC1_EVENT = 256,
} DCPERFEVENT;

// BIUPERFEVENT: event codes with the BIUPE_ prefix.  The numbering has
// gaps (87-99, 113-139, 159-199, 203-251) for which no enumerator exists.
//
// Two enumerators misspell "DIVIDED" as "DIVIED".  Those names are kept so
// that existing code continues to compile; a correctly spelled alias with
// the same value follows each of them.
typedef enum
{
    // General request/response counts (0-9)
    BIUPE_RECV_REQUESTS = 0,
    // pg data only
    BIUPE_RECV_RESPONSES = 1,
    BIUPE_IOC_READS = 2,
    BIUPE_IOC_WRITES = 3,
    BIUPE_MEM_READS = 4,
    BIUPE_MEM_WRITES = 5,
    BIUPE_FSB_SYNCS = 6,
    BIUPE_EIEIOS = 7,
    BIUPE_EOIS = 8,
    BIUPE_FLUSH_ACKS = 9,

    // Request data byte counts (10-13)
    BIUPE_REQUEST_READ_DATA_BYTES_8_BYTE_AND_LESS_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER = 10,
    BIUPE_REQUEST_READ_DATA_BYTES_16_BYTE_AND_MORE_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER = 11,
    BIUPE_REQUEST_WRITE_DATA_BYTES_8_BYTE_AND_LESS_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER = 12,
    BIUPE_REQUEST_WRITE_DATA_BYTES_16_BYTE_AND_MORE_TRANSFER_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER = 13,
    BIUPE_RESERVED0 = 14,

    // Transmit-side counts and per-target responses (15-24)
    BIUPE_XMIT_REQUESTS = 15,
    BIUPE_XMIT_RESPONSES = 16,
    BIUPE_READ128S = 17,
    BIUPE_FLUSH_REQ_READS = 18,
    BIUPE_FLUSH_REQ_WRITES = 19,
    BIUPE_CPUWB = 20,
    BIUPE_INTERRUPTS = 21,
    BIUPE_RESPONSES_MC0 = 22,
    BIUPE_RESPONSES_MC1 = 23,
    BIUPE_RESPONSES_IOC_MST = 24,

    // Response read data byte counts (25-30)
    BIUPE_MC0_RESPONSE_READ_DATA_BYTES_8_BYTE_AND_LESS_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER_ = 25,
    BIUPE_MC0_RESPONSE_READ_DATA_BYTES_16_BYTE_AND_MORE_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER_ = 26,
    BIUPE_MC1_RESPONSE_READ_DATA_BYTES_8_BYTE_AND_LESS_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER_ = 27,
    BIUPE_MC1_RESPONSE_READ_DATA_BYTES_16_BYTE_AND_MORE_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER_ = 28,
    BIUPE_IOC_MST_RESPONSE_READ_DATA_BYTES_8_BYTE_AND_LESS_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER_ = 29,
    BIUPE_IOC_MST_RESPONSE_READ_DATA_BYTES_16_BYTE_AND_MORE_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER_ = 30,

    // Reserved (31-39)
    BIUPE_RESERVED1A = 31,
    BIUPE_RESERVED2A = 32,
    BIUPE_RESERVED3A = 33,
    BIUPE_RESERVED4A = 34,
    BIUPE_RESERVED5A = 35,
    BIUPE_RESERVED6A = 36,
    BIUPE_RESERVED7A = 37,
    BIUPE_RESERVED8A = 38,
    BIUPE_RESERVED9A = 39,

    // IOC_MAST_* (40-47)
    BIUPE_REQUESTS = 40,
    BIUPE_IOC_MAST_READS = 41,
    BIUPE_IOC_MAST_WRITES = 42,
    BIUPE_IOC_MAST_EOIS = 43,
    BIUPE_IOC_MAST_REQUEST_READ_DATA_BYTES_8_BYTE_AND_LESS_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER = 44,
    BIUPE_IOC_MAST_REQUEST_READ_DATA_BYTES_16_BYTE_AND_MORE_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER = 45,
    BIUPE_IOC_MAST_REQUEST_WRITE_DATA_BYTES_8_BYTE_AND_LESS_TRANSFERS_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER = 46,
    BIUPE_IOC_MAST_REQUEST_WRITE_DATA_BYTES_16_BYTE_AND_MORE_TRANSFER_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER = 47,
    BIUPE_RESERVED10 = 48,
    BIUPE_RESERVED11 = 49,

    // IOC_SLV_* (50-62)
    BIUPE_IOC_SLV_REQUESTS = 50,
    BIUPE_IOC_SLV_SNOOP_READS = 51,
    BIUPE_IOC_SLV_SNOOP_WRITES = 52,
    BIUPE_IOC_SLV_INTERRUPTS = 53,
    BIUPE_IOC_SLV_SYNCS = 54,
    BIUPE_IOC_SLV_SNOOP_READS_NS = 55,
    BIUPE_IOC_SLV_SNOOP_WRITES_NS = 56,
    BIUPE_IOC_SLV_CPUWBS = 57,
    BIUPE_IOC_SLV_REQUEST_READ_DATA_BYTES_DIVIDED_BY_4_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER_ = 58,
    BIUPE_IOC_SLV_REQUEST_WRITE_DATA_BYTES_DIVIDED_BY_4_DEPENDENT_ON_SECONDARY_CONTROL_REGISTER_ = 59,
    BIUPE_IOC_SLV_RESPONSES_BEATS_ = 60,
    BIUPE_IOC_SLV_RESP_READ_DATA_BYTES_DIVIDED_BY_4_ = 61,
    BIUPE_IOC_SLV_SYNCCLEANS = 62,

    // Reserved (63-69)
    BIUPE_RESERVED12 = 63,
    BIUPE_RESERVED13 = 64,
    BIUPE_RESERVED14 = 65,
    BIUPE_RESERVED15 = 66,
    BIUPE_RESERVED16 = 67,
    BIUPE_RESERVED17 = 68,
    BIUPE_RESERVED18 = 69,

    // MC_0_* (70-76)
    BIUPE_MC_0_FAST_PATH_CPU_READS = 70,
    BIUPE_MC_0_ORDERED_CPU_PATH_READS = 71,
    BIUPE_MC_0_ORDERED_CPU_PATH_WRITES = 72,
    BIUPE_MC_0_ORDERED_IOC_PATH_READS = 73,
    BIUPE_MC_0_ORDERED_IOC_PATH_WRITES = 74,
    BIUPE_MC_0_SYNC_A = 75,
    BIUPE_MC_0_SYNC_B = 76,
    BIUPE_RESERVED19 = 77,
    BIUPE_RESERVED20 = 78,
    BIUPE_RESERVED21 = 79,

    // MC_1_* (80-86)
    BIUPE_MC_1_FAST_PATH_CPU_READS = 80,
    BIUPE_MC_1_ORDERED_CPU_PATH_READS = 81,
    BIUPE_MC_1_ORDERED_CPU_PATH_WRITES = 82,
    BIUPE_MC_1_ORDERED_IOC_PATH_READS = 83,
    BIUPE_MC_1_ORDERED_IOC_PATH_WRITES = 84,
    BIUPE_MC_1_SYNC_A = 85,
    BIUPE_MC_1_SYNC_B = 86,

    // *_NEARFULL (100-112)
    BIUPE_MCQF_NEARFULL = 100,
    BIUPE_MCQDF_NEARFULL = 101,
    BIUPE_IMQF_NEARFULL = 102,
    BIUPE_PSF_NEARFULL = 103,
    BIUPE_M0SF_NEARFULL = 104,
    BIUPE_M1SF_NEARFULL = 105,
    BIUPE_IMSF_NEARFULL = 106,
    BIUPE_IMSDF_NEARFULL = 107,
    BIUPE_ISQF_NEARFULL = 108,
    BIUPE_ISSF_NEARFULL = 109,
    BIUPE_ISYF_NEARFULL = 110,
    BIUPE_CPYF_NEARFULL = 111,
    BIUPE_PQF_NEARFULL = 112,

    // Latency totals (140-145); per the names, divide by the matching
    // request count to obtain an average latency.
    BIUPE_TOTAL_MEMORY_LATENCY_FAST_READS_MUST_BE_DIVIDED_BY_MEM_READ_REQUESTS_FOR_AVERAGE_LATENCY = 140,
    BIUPE_TOTAL_MEMORY_LATENCY_ORDERED_CPU_READS_MUST_BE_DIVIDED_BY_MEM_READ_REQUESTS_FOR_AVERAGE_LATENCY = 141,
    BIUPE_TOTAL_MEMORY_LATENCY_ORDERED_IOC_READS_MUST_BE_DIVIDED_BY_MEM_READ_REQUESTS_FOR_AVERAGE_LATENCY = 142,
    BIUPE_TOTAL_FLUSH_LATENCY_FOR_SNOOP_READS_MUST_BE_DIVIED_BY_FLUSH_REQUESTS_FOR_AVERAGE_LATENCY = 143,
    // Correctly spelled alias of the enumerator above.
    BIUPE_TOTAL_FLUSH_LATENCY_FOR_SNOOP_READS_MUST_BE_DIVIDED_BY_FLUSH_REQUESTS_FOR_AVERAGE_LATENCY = 143,
    BIUPE_TOTAL_FLUSH_LATENCY_FOR_SNOOP_WRITES_MUST_BE_DIVIED_BY_FLUSH_REQUESTS_FOR_AVERAGE_LATENCY = 144,
    // Correctly spelled alias of the enumerator above.
    BIUPE_TOTAL_FLUSH_LATENCY_FOR_SNOOP_WRITES_MUST_BE_DIVIDED_BY_FLUSH_REQUESTS_FOR_AVERAGE_LATENCY = 144,
    BIUPE_SNOOP_COMPLETION_BUFFER_FULL = 145,
    BIUPE_RESERVED62 = 146,
    BIUPE_RESERVED63 = 147,
    BIUPE_RESERVED64 = 148,
    BIUPE_RESERVED65 = 149,

    // Credit counts at zero (150-153)
    BIUPE_MC0_SLOW_CREDIT_COUNT_IS_ZERO = 150,
    BIUPE_MC1_SLOW_CREDIT_COUNT_IS_ZERO = 151,
    BIUPE_MC0_FAST_CREDIT_COUNT_IS_ZERO = 152,
    BIUPE_MC1_FAST_CREDIT_COUNT_IS_ZERO = 153,
    BIUPE_RESERVED66 = 154,
    BIUPE_RESERVED67 = 155,

    // FSB request stalls (156-158)
    BIUPE_FSB_SLV_REQ_STALL_FOR_COMMAND_CREDITS = 156,
    BIUPE_FSB_SLV_REQ_STALL_FOR_DATA_CREDITS = 157,
    BIUPE_FSB_PG_REQ_STALL_FOR_CMD_CREDITS = 158,

    // SCLK-domain events (200-202)
    BIUPE_SCLK_COUNTS_SCLKS_IN_THE_MCLK_DOMAIN_ = 200,
    BIUPE_IOC_MASTER_REQ_STALL_BIU_HAS_DATA_BUT_IOC_NOT_READY_SCLK_DOMAIN_ = 201,
    BIUPE_IOC_SLAVE_RSP_STALL_SCLK_DOMAIN_ = 202,

    // Fixed-increment selections (252-255)
    BIUPE_INCREMENT_BY_0_DISABLE_COUNT = 252,
    BIUPE_INCREMENT_BY_1_USED_TO_COUNT_CLOCKS = 253,
    BIUPE_INCREMENT_BY_63_USED_FOR_SIMULATION_TO_RAPIDLY_INCREMENT_COUNTER = 254,
    // Reserved
    BIUPE_OTHERS = 255,
} BIUPERFEVENT;

// IOCPERFEVENT: event codes with the IOCPE_PERF_ prefix, numbered
// contiguously from 0 through 102.  Where an event has a remark, it is
// written on the line above the enumerator it applies to.
typedef enum
{
    // CYCLES_WAITING_FOR_* (0-4)
    IOCPE_PERF_CYCLES_WAITING_FOR_REGISTER_BUS = 0,
    IOCPE_PERF_CYCLES_WAITING_FOR_BIU_SLV_REQ = 1,
    IOCPE_PERF_CYCLES_WAITING_FOR_BIU_MST_CPL = 2,
    IOCPE_PERF_CYCLES_WAITING_FOR_BSB_MST_REQ = 3,
    IOCPE_PERF_CYCLES_WAITING_FOR_MH_SLV_REQ = 4,

    // BSB_SLV_REQ_* request counts (5-12)
    IOCPE_PERF_BSB_SLV_REQ_TOTAL_REQUESTS = 5,
    IOCPE_PERF_BSB_SLV_REQ_TOTAL_READS = 6,
    // Includes interrupts.
    IOCPE_PERF_BSB_SLV_REQ_TOTAL_WRITES = 7,
    IOCPE_PERF_BSB_SLV_REQ_SNOOPED_READS = 8,
    // Includes interrupts, if snooped attr is true.
    IOCPE_PERF_BSB_SLV_REQ_SNOOPED_WRITES = 9,
    IOCPE_PERF_BSB_SLV_REQ_NON_SNOOPED_READS = 10,
    // Includes interrupts, if non-snooped attr is true.
    IOCPE_PERF_BSB_SLV_REQ_NON_SNOOPED_WRITES = 11,
    IOCPE_PERF_BSB_SLV_REQ_INTERRUPTS = 12,

    // BSB_SLV_REQ_* dword counts (13-19)
    IOCPE_PERF_BSB_SLV_REQ_TOTAL_REQUESTS_DWORDS = 13,
    IOCPE_PERF_BSB_SLV_REQ_TOTAL_READS_DWORDS = 14,
    IOCPE_PERF_BSB_SLV_REQ_TOTAL_WRITES_DWORDS = 15,
    IOCPE_PERF_BSB_SLV_REQ_SNOOPED_READS_DWORDS = 16,
    IOCPE_PERF_BSB_SLV_REQ_SNOOPED_WRITES_DWORDS = 17,
    IOCPE_PERF_BSB_SLV_REQ_NON_SNOOPED_READS_DWORDS = 18,
    IOCPE_PERF_BSB_SLV_REQ_NON_SNOOPED_WRITES_DWORDS = 19,

    // Snooped or non-snooped dependent upon the secondary control register;
    // used to calc average latency with the number of read requests.
    IOCPE_PERF_BSB_SLV_READ_LATENCY_CYCLES = 20,

    // BSB_SLV_CPL_* (21-27)
    IOCPE_PERF_BSB_SLV_CPL_TOTAL_READ_COMPLETIONS = 21,
    IOCPE_PERF_BSB_SLV_CPL_SNOOPED_READ_COMPLETIONS = 22,
    IOCPE_PERF_BSB_SLV_CPL_NON_SNOOPED_READ_COMPLETIONS = 23,
    IOCPE_PERF_BSB_SLV_CPL_TOTAL_READ_COMPLETIONS_DWORDS = 24,
    IOCPE_PERF_BSB_SLV_CPL_SNOOPED_READ_COMPLETIONS_DWORDS = 25,
    IOCPE_PERF_BSB_SLV_CPL_NON_SNOOPED_READ_COMPLETIONS_DWORDS = 26,
    // Counts ur's and ca's (ca's don't happen in production mode).
    IOCPE_PERF_BSB_SLV_CPL_URS = 27,

    // BIF_SLV_REQ_* (28-30)
    IOCPE_PERF_BIF_SLV_REQ_TOTAL_REQUESTS = 28,
    IOCPE_PERF_BIF_SLV_REQ_TOTAL_CP_WRITEBACKS = 29,
    IOCPE_PERF_BIF_SLV_REQ_TOTAL_SNOOPED_WRITES = 30,

    // BSB_MST_REQ_* and BSB_MST_CPL_* (31-38)
    IOCPE_PERF_BSB_MST_REQ_TOTAL_REQUESTS = 31,
    IOCPE_PERF_BSB_MST_REQ_TOTAL_READS = 32,
    IOCPE_PERF_BSB_MST_REQ_TOTAL_WRITES = 33,
    IOCPE_PERF_BSB_MST_REQ_TOTAL_REQUESTS_DWORDS = 34,
    IOCPE_PERF_BSB_MST_REQ_TOTAL_READS_DWORDS = 35,
    IOCPE_PERF_BSB_MST_REQ_TOTAL_WRITES_DWORDS = 36,
    IOCPE_PERF_BSB_MST_CPL_TOTAL_COMPLETIONS = 37,
    IOCPE_PERF_BSB_MST_CPL_TOTAL_COMPLETIONS_DWORDS = 38,

    // REG_BUS_* (39-42)
    IOCPE_PERF_REG_BUS_REQ_TOTAL_REQUESTS = 39,
    IOCPE_PERF_REG_BUS_REQ_TOTAL_READS = 40,
    IOCPE_PERF_REG_BUS_REQ_TOTAL_WRITES = 41,
    IOCPE_PERF_REG_BUS_CPL_READ_COMPLETIONS = 42,

    // Syncs issued / received (43-48)
    IOCPE_PERF_TOTAL_SYNCS_ISSUED = 43,
    IOCPE_PERF_BIU_SYNCS_ISSUED = 44,
    IOCPE_PERF_MH_SYNCS_ISSUED = 45,
    IOCPE_PERF_TOTAL_SYNCS_RECEIVED = 46,
    IOCPE_PERF_BIU_SYNCS_RECEIVED = 47,
    IOCPE_PERF_MH_SYNCS_RECEIVED = 48,

    // FIFO_* full / nearfull / empty (49-70)
    IOCPE_PERF_FIFO_BSB_MST_REQ_FULL = 49,
    IOCPE_PERF_FIFO_BSB_MST_REQ_EMPTY = 50,
    IOCPE_PERF_FIFO_SYNC_FULL = 51,
    IOCPE_PERF_FIFO_SYNC_EMPTY = 52,
    IOCPE_PERF_FIFO_INTERRUPT_FULL = 53,
    IOCPE_PERF_FIFO_INTERRUPT_EMPTY = 54,
    IOCPE_PERF_FIFO_REG_FULL = 55,
    IOCPE_PERF_FIFO_REG_EMPTY = 56,
    IOCPE_PERF_FIFO_BIU_SLV_CPL_NEARFULL = 57,
    IOCPE_PERF_FIFO_BIU_SLV_CPL_EMPTY = 58,
    IOCPE_PERF_FIFO_MH_SLV_CPL_FULL = 59,
    IOCPE_PERF_FIFO_MH_SLV_CPL_EMPTY = 60,
    IOCPE_PERF_FIFO_BSB_SLV_REQ_NEARFULL = 61,
    IOCPE_PERF_FIFO_BSB_SLV_REQ_EMPTY = 62,
    IOCPE_PERF_FIFO_BSB_MST_CPL_NEARFULL = 63,
    IOCPE_PERF_FIFO_BSB_MST_CPL_EMPTY = 64,
    IOCPE_PERF_FIFO_BIU_MST_REQ_NEARFULL = 65,
    IOCPE_PERF_FIFO_BIU_MST_REQ_EMPTY = 66,
    IOCPE_PERF_FIFO_BIF_SLV_REQ_NEARFULL = 67,
    IOCPE_PERF_FIFO_BIF_SLV_REQ_EMPTY = 68,
    IOCPE_PERF_FIFO_READ_LATENCY_FULL = 69,
    IOCPE_PERF_FIFO_READ_LATENCY_EMPTY = 70,

    // Clock / fixed-count / spare selections (71-74)
    IOCPE_PERF_NUMBER_OF_SYSTEM_CLOCKS = 71,
    // Disable count.
    IOCPE_PERF_ZERO_COUNT = 72,
    // Sim: rapidly inc counter.
    IOCPE_PERF_SIXTY_THREE_COUNT = 73,
    // So dff ins don't disappear.
    IOCPE_PERF_BIU_SPARE = 74,

    // Further BSB_SLV / BSB_MST counts (75-80)
    IOCPE_PERF_BSB_SLV_REQ_MESSAGES = 75,
    // All 1 dword.
    IOCPE_PERF_BSB_MST_REQ_CONFIG_READS = 76,
    // All 1 dword.
    IOCPE_PERF_BSB_MST_REQ_CONFIG_WRITES = 77,
    IOCPE_PERF_BSB_MST_REQ_POSTED_WRITES = 78,
    IOCPE_PERF_BSB_MST_REQ_POSTED_WRITES_DWORDS = 79,
    // All 1 dword.
    IOCPE_PERF_BSB_MST_CPL_NON_POSTED_WRITES = 80,

    // CYCLES_WAITING_FOR_INT_SYNC* (81-84)
    // Total time waiting for any type of interrupt sync.
    IOCPE_PERF_CYCLES_WAITING_FOR_INT_SYNC = 81,
    // Time waiting for interrupts that only caused syncs to biu.
    IOCPE_PERF_CYCLES_WAITING_FOR_INT_SYNC_SNOOP_PATH = 82,
    // Time waiting for interrupts that only caused syncs to mh.
    IOCPE_PERF_CYCLES_WAITING_FOR_INT_SYNC_NON_SNOOP_PATH = 83,
    // Time waiting for interrupts that caused syncs to mh and biu.
    IOCPE_PERF_CYCLES_WAITING_FOR_INT_SYNC_BOTH_PATHS = 84,

    // CYCLES_WAITING_FOR_MST_RD_CPL_SYNC* (85-88)
    // Total time waiting for any type of mst cpl sync.
    IOCPE_PERF_CYCLES_WAITING_FOR_MST_RD_CPL_SYNC = 85,
    // Time waiting for mst cpls that only caused syncs to biu.
    IOCPE_PERF_CYCLES_WAITING_FOR_MST_RD_CPL_SYNC_SNOOP_PATH = 86,
    // Time waiting for mst cpls that only caused syncs to mh.
    IOCPE_PERF_CYCLES_WAITING_FOR_MST_RD_CPL_SYNC_NON_SNOOP_PATH = 87,
    // Time waiting for mst cpls that caused syncs to mh and biu.
    IOCPE_PERF_CYCLES_WAITING_FOR_MST_RD_CPL_SYNC_BOTH_PATHS = 88,

    // CYCLES_WAITING_FOR_*_PASS_* (89-92)
    IOCPE_PERF_CYCLES_WAITING_FOR_NSR_PASS_SW = 89,
    IOCPE_PERF_CYCLES_WAITING_FOR_SR_PASS_NSW = 90,
    IOCPE_PERF_CYCLES_WAITING_FOR_NSW_PASS_SW = 91,
    IOCPE_PERF_CYCLES_WAITING_FOR_SW_PASS_NSW = 92,

    // Further FIFO_* full / empty (93-102)
    IOCPE_PERF_FIFO_MST_CPL_SYNC_TYPE_FIFO_FULL = 93,
    IOCPE_PERF_FIFO_MST_CPL_SYNC_TYPE_FIFO_EMPTY = 94,
    IOCPE_PERF_FIFO_BSB_SNOOP_SLV_REQ_FULL = 95,
    IOCPE_PERF_FIFO_BSB_SNOOP_SLV_REQ_EMPTY = 96,
    IOCPE_PERF_FIFO_BSB_SNOOP_SLV_REQ_DATA_FULL = 97,
    IOCPE_PERF_FIFO_BSB_SNOOP_SLV_REQ_DATA_EMPTY = 98,
    IOCPE_PERF_FIFO_READ_LATENCY_BAD_REQ_FULL = 99,
    IOCPE_PERF_FIFO_READ_LATENCY_BAD_REQ_EMPTY = 100,
    IOCPE_PERF_FIFO_READ_LATENCY_NON_SNOOP_CAM_ADDRESS_FULL = 101,
    IOCPE_PERF_FIFO_READ_LATENCY_NON_SNOOP_CAM_ADDRESS_EMPTY = 102,
} IOCPERFEVENT;

//------------------------------------------------------------------------------
// GPU performance counter register offsets

// GPUPERFREGISTER: register offsets, arranged per hardware block and per
// counter.  Each counter contributes three entries named SELECT, LOW and HI
// (presumably the event select plus the low/high parts of the count).
// Note that the relative order of the LOW and HI offsets differs between
// blocks, so always use the named constant rather than computing one from
// the other.
typedef enum
{
    // PA_SU
    GPUPERFREG_PA_SU_PERFCOUNTER0_SELECT = 0x0C88,
    GPUPERFREG_PA_SU_PERFCOUNTER0_LOW    = 0x0C8C,
    GPUPERFREG_PA_SU_PERFCOUNTER0_HI     = 0x0C8D,
    GPUPERFREG_PA_SU_PERFCOUNTER1_SELECT = 0x0C89,
    GPUPERFREG_PA_SU_PERFCOUNTER1_LOW    = 0x0C8E,
    GPUPERFREG_PA_SU_PERFCOUNTER1_HI     = 0x0C8F,
    GPUPERFREG_PA_SU_PERFCOUNTER2_SELECT = 0x0C8A,
    GPUPERFREG_PA_SU_PERFCOUNTER2_LOW    = 0x0C90,
    GPUPERFREG_PA_SU_PERFCOUNTER2_HI     = 0x0C91,
    GPUPERFREG_PA_SU_PERFCOUNTER3_SELECT = 0x0C8B,
    GPUPERFREG_PA_SU_PERFCOUNTER3_LOW    = 0x0C92,
    GPUPERFREG_PA_SU_PERFCOUNTER3_HI     = 0x0C93,

    // PA_SC
    GPUPERFREG_PA_SC_PERFCOUNTER0_SELECT = 0x0C98,
    GPUPERFREG_PA_SC_PERFCOUNTER0_LOW    = 0x0C9C,
    GPUPERFREG_PA_SC_PERFCOUNTER0_HI     = 0x0C9D,
    GPUPERFREG_PA_SC_PERFCOUNTER1_SELECT = 0x0C99,
    GPUPERFREG_PA_SC_PERFCOUNTER1_LOW    = 0x0C9E,
    GPUPERFREG_PA_SC_PERFCOUNTER1_HI     = 0x0C9F,
    GPUPERFREG_PA_SC_PERFCOUNTER2_SELECT = 0x0C9A,
    GPUPERFREG_PA_SC_PERFCOUNTER2_LOW    = 0x0CA0,
    GPUPERFREG_PA_SC_PERFCOUNTER2_HI     = 0x0CA1,
    GPUPERFREG_PA_SC_PERFCOUNTER3_SELECT = 0x0C9B,
    GPUPERFREG_PA_SC_PERFCOUNTER3_LOW    = 0x0CA2,
    GPUPERFREG_PA_SC_PERFCOUNTER3_HI     = 0x0CA3,

    // VGT
    GPUPERFREG_VGT_PERFCOUNTER0_SELECT   = 0x0C48,
    GPUPERFREG_VGT_PERFCOUNTER0_LOW      = 0x0C4C,
    GPUPERFREG_VGT_PERFCOUNTER0_HI       = 0x0C4D,
    GPUPERFREG_VGT_PERFCOUNTER1_SELECT   = 0x0C49,
    GPUPERFREG_VGT_PERFCOUNTER1_LOW      = 0x0C4E,
    GPUPERFREG_VGT_PERFCOUNTER1_HI       = 0x0C4F,
    GPUPERFREG_VGT_PERFCOUNTER2_SELECT   = 0x0C4A,
    GPUPERFREG_VGT_PERFCOUNTER2_LOW      = 0x0C50,
    GPUPERFREG_VGT_PERFCOUNTER2_HI       = 0x0C51,
    GPUPERFREG_VGT_PERFCOUNTER3_SELECT   = 0x0C4B,
    GPUPERFREG_VGT_PERFCOUNTER3_LOW      = 0x0C52,
    GPUPERFREG_VGT_PERFCOUNTER3_HI       = 0x0C53,

    // TCR
    GPUPERFREG_TCR_PERFCOUNTER0_SELECT   = 0x0E05,
    GPUPERFREG_TCR_PERFCOUNTER0_LOW      = 0x0E07,
    GPUPERFREG_TCR_PERFCOUNTER0_HI       = 0x0E06,
    GPUPERFREG_TCR_PERFCOUNTER1_SELECT   = 0x0E08,
    GPUPERFREG_TCR_PERFCOUNTER1_LOW      = 0x0E0A,
    GPUPERFREG_TCR_PERFCOUNTER1_HI       = 0x0E09,

    // TP0
    GPUPERFREG_TP0_PERFCOUNTER0_SELECT   = 0x0E1F,
    GPUPERFREG_TP0_PERFCOUNTER0_LOW      = 0x0E21,
    GPUPERFREG_TP0_PERFCOUNTER0_HI       = 0x0E20,
    GPUPERFREG_TP0_PERFCOUNTER1_SELECT   = 0x0E22,
    GPUPERFREG_TP0_PERFCOUNTER1_LOW      = 0x0E24,
    GPUPERFREG_TP0_PERFCOUNTER1_HI       = 0x0E23,

    // TP1
    GPUPERFREG_TP1_PERFCOUNTER0_SELECT   = 0x0E28,
    GPUPERFREG_TP1_PERFCOUNTER0_LOW      = 0x0E2A,
    GPUPERFREG_TP1_PERFCOUNTER0_HI       = 0x0E29,
    GPUPERFREG_TP1_PERFCOUNTER1_SELECT   = 0x0E2B,
    GPUPERFREG_TP1_PERFCOUNTER1_LOW      = 0x0E2D,
    GPUPERFREG_TP1_PERFCOUNTER1_HI       = 0x0E2C,

    // TP2
    GPUPERFREG_TP2_PERFCOUNTER0_SELECT   = 0x0E31,
    GPUPERFREG_TP2_PERFCOUNTER0_LOW      = 0x0E33,
    GPUPERFREG_TP2_PERFCOUNTER0_HI       = 0x0E32,
    GPUPERFREG_TP2_PERFCOUNTER1_SELECT   = 0x0E34,
    GPUPERFREG_TP2_PERFCOUNTER1_LOW      = 0x0E36,
    GPUPERFREG_TP2_PERFCOUNTER1_HI       = 0x0E35,

    // TP3
    GPUPERFREG_TP3_PERFCOUNTER0_SELECT   = 0x0E3A,
    GPUPERFREG_TP3_PERFCOUNTER0_LOW      = 0x0E3C,
    GPUPERFREG_TP3_PERFCOUNTER0_HI       = 0x0E3B,
    GPUPERFREG_TP3_PERFCOUNTER1_SELECT   = 0x0E3D,
    GPUPERFREG_TP3_PERFCOUNTER1_LOW      = 0x0E3F,
    GPUPERFREG_TP3_PERFCOUNTER1_HI       = 0x0E3E,

    // TCM
    GPUPERFREG_TCM_PERFCOUNTER0_SELECT   = 0x0E54,
    GPUPERFREG_TCM_PERFCOUNTER0_LOW      = 0x0E56,
    GPUPERFREG_TCM_PERFCOUNTER0_HI       = 0x0E55,
    GPUPERFREG_TCM_PERFCOUNTER1_SELECT   = 0x0E57,
    GPUPERFREG_TCM_PERFCOUNTER1_LOW      = 0x0E59,
    GPUPERFREG_TCM_PERFCOUNTER1_HI       = 0x0E58,

    // TCF
    GPUPERFREG_TCF_PERFCOUNTER0_SELECT   = 0x0E5A,
    GPUPERFREG_TCF_PERFCOUNTER0_LOW      = 0x0E5C,
    GPUPERFREG_TCF_PERFCOUNTER0_HI       = 0x0E5B,
    GPUPERFREG_TCF_PERFCOUNTER1_SELECT   = 0x0E5D,
    GPUPERFREG_TCF_PERFCOUNTER1_LOW      = 0x0E5F,
    GPUPERFREG_TCF_PERFCOUNTER1_HI       = 0x0E5E,
    GPUPERFREG_TCF_PERFCOUNTER2_SELECT   = 0x0E60,
    GPUPERFREG_TCF_PERFCOUNTER2_LOW      = 0x0E62,
    GPUPERFREG_TCF_PERFCOUNTER2_HI       = 0x0E61,
    GPUPERFREG_TCF_PERFCOUNTER3_SELECT   = 0x0E63,
    GPUPERFREG_TCF_PERFCOUNTER3_LOW      = 0x0E65,
    GPUPERFREG_TCF_PERFCOUNTER3_HI       = 0x0E64,
    GPUPERFREG_TCF_PERFCOUNTER4_SELECT   = 0x0E66,
    GPUPERFREG_TCF_PERFCOUNTER4_LOW      = 0x0E68,
    GPUPERFREG_TCF_PERFCOUNTER4_HI       = 0x0E67,
    GPUPERFREG_TCF_PERFCOUNTER5_SELECT   = 0x0E69,
    GPUPERFREG_TCF_PERFCOUNTER5_LOW      = 0x0E6B,
    GPUPERFREG_TCF_PERFCOUNTER5_HI       = 0x0E6A,
    GPUPERFREG_TCF_PERFCOUNTER6_SELECT   = 0x0E6C,
    GPUPERFREG_TCF_PERFCOUNTER6_LOW      = 0x0E6E,
    GPUPERFREG_TCF_PERFCOUNTER6_HI       = 0x0E6D,
    GPUPERFREG_TCF_PERFCOUNTER7_SELECT   = 0x0E6F,
    GPUPERFREG_TCF_PERFCOUNTER7_LOW      = 0x0E71,
    GPUPERFREG_TCF_PERFCOUNTER7_HI       = 0x0E70,
    GPUPERFREG_TCF_PERFCOUNTER8_SELECT   = 0x0E72,
    GPUPERFREG_TCF_PERFCOUNTER8_LOW      = 0x0E74,
    GPUPERFREG_TCF_PERFCOUNTER8_HI       = 0x0E73,
    GPUPERFREG_TCF_PERFCOUNTER9_SELECT   = 0x0E75,
    GPUPERFREG_TCF_PERFCOUNTER9_LOW      = 0x0E77,
    GPUPERFREG_TCF_PERFCOUNTER9_HI       = 0x0E76,
    GPUPERFREG_TCF_PERFCOUNTER10_SELECT  = 0x0E78,
    GPUPERFREG_TCF_PERFCOUNTER10_LOW     = 0x0E7A,
    GPUPERFREG_TCF_PERFCOUNTER10_HI      = 0x0E79,
    GPUPERFREG_TCF_PERFCOUNTER11_SELECT  = 0x0E7B,
    GPUPERFREG_TCF_PERFCOUNTER11_LOW     = 0x0E7D,
    GPUPERFREG_TCF_PERFCOUNTER11_HI      = 0x0E7C,

    // VC
    GPUPERFREG_VC_PERFCOUNTER0_SELECT    = 0x0E48,
    GPUPERFREG_VC_PERFCOUNTER0_LOW       = 0x0E4A,
    GPUPERFREG_VC_PERFCOUNTER0_HI        = 0x0E49,
    GPUPERFREG_VC_PERFCOUNTER1_SELECT    = 0x0E4B,
    GPUPERFREG_VC_PERFCOUNTER1_LOW       = 0x0E4D,
    GPUPERFREG_VC_PERFCOUNTER1_HI        = 0x0E4C,
    GPUPERFREG_VC_PERFCOUNTER2_SELECT    = 0x0E4E,
    GPUPERFREG_VC_PERFCOUNTER2_LOW       = 0x0E50,
    GPUPERFREG_VC_PERFCOUNTER2_HI        = 0x0E4F,
    GPUPERFREG_VC_PERFCOUNTER3_SELECT    = 0x0E51,
    GPUPERFREG_VC_PERFCOUNTER3_LOW       = 0x0E53,
    GPUPERFREG_VC_PERFCOUNTER3_HI        = 0x0E52,

    // SQ
    GPUPERFREG_SQ_PERFCOUNTER0_SELECT    = 0x0DC8,
    GPUPERFREG_SQ_PERFCOUNTER0_LOW       = 0x0DCC,
    GPUPERFREG_SQ_PERFCOUNTER0_HI        = 0x0DCD,
    GPUPERFREG_SQ_PERFCOUNTER1_SELECT    = 0x0DC9,
    GPUPERFREG_SQ_PERFCOUNTER1_LOW       = 0x0DCE,
    GPUPERFREG_SQ_PERFCOUNTER1_HI        = 0x0DCF,
    GPUPERFREG_SQ_PERFCOUNTER2_SELECT    = 0x0DCA,
    GPUPERFREG_SQ_PERFCOUNTER2_LOW       = 0x0DD0,
    GPUPERFREG_SQ_PERFCOUNTER2_HI        = 0x0DD1,
    GPUPERFREG_SQ_PERFCOUNTER3_SELECT    = 0x0DCB,
    GPUPERFREG_SQ_PERFCOUNTER3_LOW       = 0x0DD2,
    GPUPERFREG_SQ_PERFCOUNTER3_HI        = 0x0DD3,

    // SX
    GPUPERFREG_SX_PERFCOUNTER0_SELECT    = 0x0DD4,
    GPUPERFREG_SX_PERFCOUNTER0_LOW       = 0x0DD8,
    GPUPERFREG_SX_PERFCOUNTER0_HI        = 0x0DD9,

    // MC0 / MC1
    GPUPERFREG_MC0_PERFCOUNTER0_SELECT   = 0x0815,
    GPUPERFREG_MC0_PERFCOUNTER0_LOW      = 0x0817,
    GPUPERFREG_MC0_PERFCOUNTER0_HI       = 0x0816,
    GPUPERFREG_MC1_PERFCOUNTER0_SELECT   = 0x0855,
    GPUPERFREG_MC1_PERFCOUNTER0_LOW      = 0x0857,
    GPUPERFREG_MC1_PERFCOUNTER0_HI       = 0x0856,

    // MH
    GPUPERFREG_MH_PERFCOUNTER0_SELECT    = 0x0A18,
    GPUPERFREG_MH_PERFCOUNTER0_LOW       = 0x0A1A,
    GPUPERFREG_MH_PERFCOUNTER0_HI        = 0x0A19,
    GPUPERFREG_MH_PERFCOUNTER1_SELECT    = 0x0A1B,
    GPUPERFREG_MH_PERFCOUNTER1_LOW       = 0x0A1D,
    GPUPERFREG_MH_PERFCOUNTER1_HI        = 0x0A1C,
    GPUPERFREG_MH_PERFCOUNTER2_SELECT    = 0x0A1E,
    GPUPERFREG_MH_PERFCOUNTER2_LOW       = 0x0A20,
    GPUPERFREG_MH_PERFCOUNTER2_HI        = 0x0A1F,

    // BIF
    GPUPERFREG_BIF_PERFCOUNTER0_SELECT   = 0x0048,
    GPUPERFREG_BIF_PERFCOUNTER0_LOW      = 0x004A,
    GPUPERFREG_BIF_PERFCOUNTER0_HI       = 0x0049,

    // HZ
    GPUPERFREG_HZ_PERFCOUNTER0_SELECT    = 0x1004,
    GPUPERFREG_HZ_PERFCOUNTER0_LOW       = 0x1006,
    GPUPERFREG_HZ_PERFCOUNTER0_HI        = 0x1005,
    GPUPERFREG_HZ_PERFCOUNTER1_SELECT    = 0x1007,
    GPUPERFREG_HZ_PERFCOUNTER1_LOW       = 0x1009,
    GPUPERFREG_HZ_PERFCOUNTER1_HI        = 0x1008,

    // BC
    GPUPERFREG_BC_PERFCOUNTER0_SELECT    = 0x0F04,
    GPUPERFREG_BC_PERFCOUNTER0_LOW       = 0x0F08,
    GPUPERFREG_BC_PERFCOUNTER0_HI        = 0x0F09,
    GPUPERFREG_BC_PERFCOUNTER1_SELECT    = 0x0F05,
    GPUPERFREG_BC_PERFCOUNTER1_LOW       = 0x0F0A,
    GPUPERFREG_BC_PERFCOUNTER1_HI        = 0x0F0B,
    GPUPERFREG_BC_PERFCOUNTER2_SELECT    = 0x0F06,
    GPUPERFREG_BC_PERFCOUNTER2_LOW       = 0x0F0C,
    GPUPERFREG_BC_PERFCOUNTER2_HI        = 0x0F0D,
    GPUPERFREG_BC_PERFCOUNTER3_SELECT    = 0x0F07,
    GPUPERFREG_BC_PERFCOUNTER3_LOW       = 0x0F0E,
    GPUPERFREG_BC_PERFCOUNTER3_HI        = 0x0F0F,

    // RBBM
    GPUPERFREG_RBBM_PERFCOUNTER0_SELECT  = 0x0395,
    GPUPERFREG_RBBM_PERFCOUNTER0_LOW     = 0x0397,
    GPUPERFREG_RBBM_PERFCOUNTER0_HI      = 0x0398,
    GPUPERFREG_RBBM_PERFCOUNTER1_SELECT  = 0x0396,
    GPUPERFREG_RBBM_PERFCOUNTER1_LOW     = 0x0399,
    GPUPERFREG_RBBM_PERFCOUNTER1_HI      = 0x039A,

    // CP
    GPUPERFREG_CP_PERFCOUNTER0_SELECT    = 0x01E6,
    GPUPERFREG_CP_PERFCOUNTER0_LOW       = 0x01E7,
    GPUPERFREG_CP_PERFCOUNTER0_HI        = 0x01E8,

    // Registers that are not part of a SELECT/LOW/HI triple.
    GPUPERFREG_CP_PERFMON_CNTL           = 0x01F5,
    GPUPERFREG_VGT_EVENT_INITIATOR       = 0x21F9,
} GPUPERFREGISTER;

// BIUPERFREGISTER: register offsets, one SELECT/LOW/HI triple for each of
// four counters, followed by the PERFMON_CNTL register.  For every counter
// the offsets run SELECT, HI, LOW in ascending order.
typedef enum
{
    // Counter 0
    BIUPERFREG_BIU_PERFCOUNTER0_SELECT = 0x408041,
    BIUPERFREG_BIU_PERFCOUNTER0_LOW    = 0x408043,
    BIUPERFREG_BIU_PERFCOUNTER0_HI     = 0x408042,

    // Counter 1
    BIUPERFREG_BIU_PERFCOUNTER1_SELECT = 0x408044,
    BIUPERFREG_BIU_PERFCOUNTER1_LOW    = 0x408046,
    BIUPERFREG_BIU_PERFCOUNTER1_HI     = 0x408045,

    // Counter 2
    BIUPERFREG_BIU_PERFCOUNTER2_SELECT = 0x408047,
    BIUPERFREG_BIU_PERFCOUNTER2_LOW    = 0x408049,
    BIUPERFREG_BIU_PERFCOUNTER2_HI     = 0x408048,

    // Counter 3
    BIUPERFREG_BIU_PERFCOUNTER3_SELECT = 0x40804A,
    BIUPERFREG_BIU_PERFCOUNTER3_LOW    = 0x40804C,
    BIUPERFREG_BIU_PERFCOUNTER3_HI     = 0x40804B,

    BIUPERFREG_BIU_PERFMON_CNTL        = 0x408040,
} BIUPERFREGISTER;

// DCPERFREGISTER: register offsets, one SELECT/LOW/HI triple for each of
// two counters.  For both counters the offsets run SELECT, HI, LOW in
// ascending order.
typedef enum
{
    // Counter 0
    DCPERFREG_DC_PERFCOUNTER0_SELECT = 0x1FC8,
    DCPERFREG_DC_PERFCOUNTER0_LOW    = 0x1FCA,
    DCPERFREG_DC_PERFCOUNTER0_HI     = 0x1FC9,

    // Counter 1
    DCPERFREG_DC_PERFCOUNTER1_SELECT = 0x1FCB,
    DCPERFREG_DC_PERFCOUNTER1_LOW    = 0x1FCD,
    DCPERFREG_DC_PERFCOUNTER1_HI     = 0x1FCC,
} DCPERFREGISTER;

// IOCPERFREGISTER: register offsets, one SELECT/LOW/HI triple for each of
// four counters, followed by the PERFMON_CNTL register.  For every counter
// the offsets run SELECT, HI, LOW in ascending order.
typedef enum
{
    // Counter 0
    IOCPERFREG_IOC_PERFCOUNTER0_SELECT = 0x410081,
    IOCPERFREG_IOC_PERFCOUNTER0_LOW    = 0x410083,
    IOCPERFREG_IOC_PERFCOUNTER0_HI     = 0x410082,

    // Counter 1
    IOCPERFREG_IOC_PERFCOUNTER1_SELECT = 0x410084,
    IOCPERFREG_IOC_PERFCOUNTER1_LOW    = 0x410086,
    IOCPERFREG_IOC_PERFCOUNTER1_HI     = 0x410085,

    // Counter 2
    IOCPERFREG_IOC_PERFCOUNTER2_SELECT = 0x410087,
    IOCPERFREG_IOC_PERFCOUNTER2_LOW    = 0x410089,
    IOCPERFREG_IOC_PERFCOUNTER2_HI     = 0x410088,

    // Counter 3
    IOCPERFREG_IOC_PERFCOUNTER3_SELECT = 0x41008A,
    IOCPERFREG_IOC_PERFCOUNTER3_LOW    = 0x41008C,
    IOCPERFREG_IOC_PERFCOUNTER3_HI     = 0x41008B,

    IOCPERFREG_IOC_PERFMON_CNTL        = 0x410080,
} IOCPERFREGISTER;


//------------------------------------------------------------------------------
// GPU performance counter register defines

// GPUPERFCOUNTER_SELECT: a DWORD that can be accessed either through named
// bitfields or as a single raw value.  The named fields cover 16 bits; the
// remaining 16 bits are unnamed.
typedef union
{
    struct
    {
        DWORD Select : 8;   // presumably the event code to count -- confirm against callers
        DWORD N      : 8;   // NOTE(review): meaning not evident from this header
        DWORD        : 16;  // unnamed padding
    };
    DWORD dword;            // the same storage viewed as one DWORD
} GPUPERFCOUNTER_SELECT;

// GPUPERFCOUNTER_VALUE: bitfield view (32 + 16 named bits, 16 unnamed)
// overlaid on a ULARGE_INTEGER.
typedef union
{
    struct
    {
        DWORD Low  : 32;
        DWORD High : 16;
        DWORD      : 16;    // unnamed padding
    };
    ULARGE_INTEGER qword;   // the same storage viewed as a ULARGE_INTEGER
} GPUPERFCOUNTER_VALUE;

// DCPERFCOUNTER_VALUE: bitfield view (32 + 16 named bits, 16 unnamed)
// overlaid on a ULARGE_INTEGER.
typedef union
{
    struct
    {
        DWORD Low  : 32;
        DWORD High : 16;
        DWORD      : 16;    // unnamed padding
    };
    ULARGE_INTEGER qword;   // the same storage viewed as a ULARGE_INTEGER
} DCPERFCOUNTER_VALUE;

// BIUPERFCOUNTER_VALUE: bitfield view (32 + 16 named bits, 16 unnamed)
// overlaid on a ULARGE_INTEGER.
typedef union
{
    struct
    {
        DWORD Low  : 32;
        DWORD High : 16;
        DWORD      : 16;    // unnamed padding
    };
    ULARGE_INTEGER qword;   // the same storage viewed as a ULARGE_INTEGER
} BIUPERFCOUNTER_VALUE;

// IOCPERFCOUNTER_VALUE: bitfield view (32 + 16 named bits, 16 unnamed)
// overlaid on a ULARGE_INTEGER.
typedef union
{
    struct
    {
        DWORD Low  : 32;
        DWORD High : 16;
        DWORD      : 16;    // unnamed padding
    };
    ULARGE_INTEGER qword;   // the same storage viewed as a ULARGE_INTEGER
} IOCPERFCOUNTER_VALUE;

// GPUPERFCOUNTER_CNTL: a DWORD accessible through named bitfields or as a
// raw value.  Two fields are named (State, EnableMode); the rest of the
// 32 bits is unnamed padding.
typedef union
{
    struct
    {
        DWORD State      : 4;
        DWORD            : 4;   // unnamed gap between State and EnableMode
        DWORD EnableMode : 2;
        DWORD            : 22;  // unnamed padding
    };
    DWORD dword;                // the same storage viewed as one DWORD
} GPUPERFCOUNTER_CNTL;

// DCPERFCOUNTER_CNTL: a DWORD accessible through named bitfields or as a
// raw value.  Two fields are named (State, EnableMode); the rest of the
// 32 bits is unnamed padding.
typedef union
{
    struct
    {
        DWORD State      : 4;
        DWORD            : 4;   // unnamed gap between State and EnableMode
        DWORD EnableMode : 2;
        DWORD            : 22;  // unnamed padding
    };
    DWORD dword;                // the same storage viewed as one DWORD
} DCPERFCOUNTER_CNTL;

// BIUPERFCOUNTER_CNTL: a DWORD accessible through a single named 3-bit
// field (State) or as a raw value.  Note that State is 3 bits wide here,
// whereas the GPU, DC and IOC control unions declare it as 4 bits.
typedef union
{
    struct
    {
        DWORD State : 3;
        DWORD       : 29;   // unnamed padding
    };
    DWORD dword;            // the same storage viewed as one DWORD
} BIUPERFCOUNTER_CNTL;

// IOCPERFCOUNTER_CNTL: a DWORD accessible through a single named 4-bit
// field (State) or as a raw value.
typedef union
{
    struct
    {
        DWORD State : 4;
        DWORD       : 28;   // unnamed padding
    };
    DWORD dword;            // the same storage viewed as one DWORD
} IOCPERFCOUNTER_CNTL;


// Restore the bitfield ordering that was saved with
// '#pragma bitfield_order(push)' near the top of this header.  Like the
// push, the pop is only issued when _M_PPCBE is defined.
#if defined(_M_PPCBE)
#pragma bitfield_order(pop)
#endif

// Restore the warning state saved by '#pragma warning(push)' at the top of
// this header, re-enabling warnings 4201 and 4200 if they were enabled before.
#pragma warning(pop)

// Close the extern "C" block that the top of this header opens for C++.
#ifdef __cplusplus
};
#endif

// End of the _D3D9GPU_H_ include guard.
#endif /* _D3D9GPU_H_ */