Developing Games on Intel Graphics
If you are gaming on graphics integrated in your Intel Processor, this is the place for you! Find answers to your questions or post your issues with PC games
486 Discussions

OpenGL - glTexSubImage2D on Intel 4 Series/Windows 7 Performance

mfah
Novice
1,052 Views
glTexSubImage2D is unacceptably slow with an Intel Series 4 on Windows 7, driver version 8.15.10.2141 (latest). Benchmarked with the following code:
	// Timestamp taken immediately before the upload loop.
	DWORD start = timeGetTime ();

	// Re-upload the same buffer NUM_TEXIMAGE times so the elapsed time is large enough to measure.
	for (int i = 0; i < NUM_TEXIMAGE; i++)
	{
		glBindTexture (GL_TEXTURE_2D, teximage);
		glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, sibuffer);
	}

	// Timestamp taken as soon as the last call returns; (end - start) is the figure quoted in the post.
	DWORD end = timeGetTime ();
A single call requires some 20ms to return for a 512x512 texture; the code above (TEX_WIDTH is 64, TEX_HEIGHT is 512, NUM_TEXIMAGE is 16, timeBeginPeriod (1)) requires ~45ms, compared to < 5ms on an Intel 945, older driver rev, Windows XP. Have ensured formats match and have attempted use of PBO with no measurable performance difference. PFD_SUPPORT_COMPOSITION makes no difference

Is there a pullback from texture memory to system memory happening here?
0 Kudos
1 Solution
mfah
Novice
1,052 Views
The following test app (SDL, mostly portable, compiles with MSVC 2008) can be used to test/verify this issue:
#define WINDOW_WIDTH    800
#define WINDOW_HEIGHT    600

#include "SDL.h"
#include "SDL_opengl.h"

#pragma comment (lib, "SDL.lib")
#pragma comment (lib, "SDLmain.lib")
#pragma comment (lib, "opengl32.lib")

#define TEX_WIDTH 512
#define TEX_HEIGHT 512

// source pixel data handed to glTexSubImage2D; one unsigned int per texel
unsigned int sibuffer[TEX_WIDTH * TEX_HEIGHT];
// texture object name used for drawing; assigned in main from R_MakeMeATexture
unsigned int teximage = 0;
// incremented once per RenderOpenGL call and mixed into the generated pixel values
unsigned int framecount = 0;

// find the fastest modes to use for glTexSubImage2D
// one candidate format/type combination plus the result of timing it
typedef struct tsitest_s
{
    char formatstr[64];     // printable name of 'format', used in the results printout
    char typestr[64];       // printable name of 'type', used in the results printout
    GLenum format;          // pixel format argument passed to glTexImage2D/glTexSubImage2D
    GLenum type;            // pixel type argument passed to glTexImage2D/glTexSubImage2D
    int modespeed;          // elapsed SDL_GetTicks ticks for the timed upload loop
    bool failed;            // true if glGetError reported an error while testing this mode
} tsitest_t;

// candidate format/type pairs to benchmark, in tsitest_t member order:
// {formatstr, typestr, format, type, modespeed, failed}
// every entry starts out with modespeed 666 and failed == true
tsitest_t tsimodes[] =
{
    {"GL_RGBA", "GL_UNSIGNED_BYTE", GL_RGBA, GL_UNSIGNED_BYTE, 666, true},
    {"GL_BGRA", "GL_UNSIGNED_BYTE", GL_BGRA, GL_UNSIGNED_BYTE, 666, true},
    {"GL_RGBA", "GL_UNSIGNED_INT_8_8_8_8", GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, 666, true},
    {"GL_BGRA", "GL_UNSIGNED_INT_8_8_8_8", GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, 666, true},
    {"GL_RGBA", "GL_UNSIGNED_INT_8_8_8_8_REV", GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, 666, true},
    {"GL_BGRA", "GL_UNSIGNED_INT_8_8_8_8_REV", GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, 666, true}
};


// index into tsimodes of the chosen mode; 666 means "none selected" and is checked in R_SetTSIMode
int fastest = 666;
// lowest modespeed accepted so far; starts at 32768 so any smaller measured time replaces it
int bestspeed = 32768;

// Generates a texture object, leaves it bound to GL_TEXTURE_2D with linear
// min/mag filtering, and allocates level 0 at width x height without
// supplying any pixel data.
//
// The internal format is always GL_RGBA; 'format' and 'type' are only the
// client-side format/type arguments given to glTexImage2D.
//
// Returns the texture name written by glGenTextures (0 if it wrote nothing).
GLuint R_MakeMeATexture (int width, int height, GLenum format, GLenum type)
{
    const GLint filter = GL_LINEAR;
    GLuint newtexture = 0;

    glGenTextures (1, &newtexture);

    glEnable (GL_TEXTURE_2D);
    glBindTexture (GL_TEXTURE_2D, newtexture);

    glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
    glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);

    // no source pointer: storage is specified but nothing is uploaded yet
    glTexImage2D (GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, format, type, NULL);

    // wait for the GL to finish so that later timings do not include this work
    glFinish ();

    return newtexture;
}


// Benchmarks glTexSubImage2D for every entry in tsimodes and selects the
// fastest one that did not raise a GL error.
//
// For each mode: a fresh texture is created, 16 full-size uploads from
// sibuffer are timed with SDL_GetTicks, and the elapsed ticks and error
// status are stored back into that mode's entry.  All results are then
// printed, and the global 'fastest' is set to the index of the best mode
// (with 'bestspeed' holding its time).  Because the comparison is <=, a tie
// goes to the later entry.
//
// If no mode succeeds, 'fastest' keeps its 666 sentinel, a message box is
// shown and the process exits.
void R_SetTSIMode (void)
{
    int numtsimodes = sizeof (tsimodes) / sizeof (tsimodes[0]);

    for (int i = 0; i < numtsimodes; i++)
    {
        // clear the last error (if any) so the check below only sees errors from this mode
        glGetError ();

        // create a new texture object; R_MakeMeATexture leaves it bound to GL_TEXTURE_2D
        GLuint texnum = R_MakeMeATexture (TEX_WIDTH, TEX_HEIGHT, tsimodes[i].format, tsimodes[i].type);

        Uint32 start = SDL_GetTicks ();

        // SDL_GetTicks has insufficient resolution to measure one call so we need to run a few of them
        for (int t = 0; t < 16; t++)
        {
            glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, tsimodes[i].format, tsimodes[i].type, sibuffer);
        }

        // taken as soon as the calls return: this measures time-to-return, not GPU completion
        Uint32 end = SDL_GetTicks ();

        // finish outstanding work before the next mode is timed
        glFinish ();

        glDeleteTextures (1, &texnum);

        tsimodes[i].failed = (glGetError () != GL_NO_ERROR);
        tsimodes[i].modespeed = (end - start);
    }

    for (int i = 0; i < numtsimodes; i++)
    {
        printf ("mode: %i %4ims [%s/%s] (%s)\n", i, tsimodes[i].modespeed, tsimodes[i].formatstr,
            tsimodes[i].typestr, tsimodes[i].failed ? "FAILED" : "OK");

        if (tsimodes[i].modespeed <= bestspeed && !tsimodes[i].failed)
        {
            bestspeed = tsimodes[i].modespeed;
            fastest = i;
        }
    }

    // still the sentinel: every mode failed (or was slower than the initial bestspeed)
    if (fastest == 666)
    {
        MessageBox (NULL, "Failed to find a format!", "Error", MB_OK | MB_ICONSTOP);
        exit (0);
    }
}


// Renders one frame: regenerates the animated test pattern in sibuffer,
// uploads it to 'teximage' with the format/type chosen by R_SetTSIMode
// (tsimodes[fastest]), and draws it as a TEX_WIDTH x TEX_HEIGHT textured quad
// at the top-left of the window.  Does not swap buffers; the caller does.
void RenderOpenGL (void)
{
    framecount++;

    // i is the running texel index into sibuffer; it advances once per inner iteration
    for (int i = 0, w = 0; w < TEX_WIDTH; w++)
    {
        for (int h = 0; h < TEX_HEIGHT; h++, i++)
        {
            // address the four bytes of texel i (not the start of the array)
            unsigned char *rgba = (unsigned char *) &sibuffer[i];

            // 2 == red, 1 == green, 0 == blue
            // NOTE(review): this byte order presumably matches the BGRA modes only - confirm
            rgba[2] = ((h * i) + framecount) & 255;
            rgba[1] = ((w * h) + framecount) & 255;
            rgba[0] = ((w * i) + framecount) & 255;
            rgba[3] = 255;
        }
    }

    glClear (GL_COLOR_BUFFER_BIT);

    glViewport (0, 0, WINDOW_WIDTH, WINDOW_HEIGHT);

    glMatrixMode (GL_MODELVIEW);
    glLoadIdentity ();

    // pixel-space orthographic projection with y increasing downwards
    glMatrixMode (GL_PROJECTION);
    glLoadIdentity ();
    glOrtho (0, WINDOW_WIDTH, WINDOW_HEIGHT, 0, -99999, 99999);

    glBindTexture (GL_TEXTURE_2D, teximage);
    glTexEnvi (GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);

    // this is the call whose performance the test app exists to exercise
    glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, tsimodes[fastest].format, tsimodes[fastest].type, sibuffer);

    glBegin (GL_QUADS);

    glTexCoord2f (0, 0);
    glVertex2f (0, 0);

    glTexCoord2f (1, 0);
    glVertex2f (TEX_WIDTH, 0);

    glTexCoord2f (1, 1);
    glVertex2f (TEX_WIDTH, TEX_HEIGHT);

    glTexCoord2f (0, 1);
    glVertex2f (0, TEX_HEIGHT);

    glEnd ();
}


int main (int argc, char *argv[])
{
    if (SDL_Init (SDL_INIT_VIDEO | SDL_INIT_NOPARACHUTE) != 0)
    {
        printf ("Unable to initialize SDL: %s\n", SDL_GetError ());
        return 1;
    }

    SDL_GL_SetAttribute (SDL_GL_DOUBLEBUFFER, 1);
    SDL_Surface *screen = SDL_SetVideoMode (WINDOW_WIDTH, WINDOW_HEIGHT, 32, SDL_OPENGL);

    R_SetTSIMode ();
    teximage = R_MakeMeATexture (TEX_WIDTH, TEX_HEIGHT, tsimodes[fastest].format, tsimodes[fastest].type);
    glClearColor (0, 0, 0, 1);

    int done = 0;
    SDL_Event evt;

    while (!done)
    {
        while (!done && SDL_PollEvent (&evt))
        {
            if (evt.type == SDL_QUIT)
            {
                done = 1;
                break;
            }
        }

        // run the screen update here
        RenderOpenGL ();
        SDL_GL_SwapBuffers ();
    }

    return 0;
}

View solution in original post

0 Kudos
5 Replies
mfah
Novice
1,052 Views
Update:

Internal Format: GL_RGBA
Format: GL_BGRA
Type: GL_UNSIGNED_INT_8_8_8_8_REV

Problem completely goes away.

I suspect that the driver was pulling the teximage data back to system memory otherwise. Can anyone confirm or deny?


mode: 0 320ms [GL_RGBA/GL_UNSIGNED_BYTE] (OK)
mode: 1 317ms [GL_BGRA/GL_UNSIGNED_BYTE] (OK)
mode: 2 377ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 3 375ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 4 376ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
mode: 5 12ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)

0 Kudos
Doraisamy_G_Intel
1,052 Views
Hi mfah,

What do you mean the problem completely goes away?

From your first post, I take the problem to be a performance difference between an older driver on a 945 and a newer driver on an Intel 4 Series card.

Are you saying that if you used the GL_UNSIGNED_INT_8_8_8_8_REV / GL_BGRA, this performance difference is not seen anymore?

Thanks,
-Ganesh
0 Kudos
mfah
Novice
1,052 Views
Hi,

Yes, this is correct. The older part/older driver did not exhibit this performance problem, the newer one does. Running a VMWare session (via VMWare's SVGA driver) on the newer part/newer driver also does not exhibit this problem.

By switching the type and format to GL_UNSIGNED_INT_8_8_8_8_REV/GL_BGRA the performance problem is removed.
0 Kudos
Doraisamy_G_Intel
1,052 Views

Can you provide a test kernel? We will try to replicate the issue and try to provide a better explanation.

Thanks.

0 Kudos
mfah
Novice
1,053 Views
The following test app (SDL, mostly portable, compiles with MSVC 2008) can be used to test/verify this issue:
#define WINDOW_WIDTH    800
#define WINDOW_HEIGHT    600

#include "SDL.h"
#include "SDL_opengl.h"

#pragma comment (lib, "SDL.lib")
#pragma comment (lib, "SDLmain.lib")
#pragma comment (lib, "opengl32.lib")

#define TEX_WIDTH 512
#define TEX_HEIGHT 512

// source pixel data handed to glTexSubImage2D; one unsigned int per texel
unsigned int sibuffer[TEX_WIDTH * TEX_HEIGHT];
// texture object name used for drawing; assigned in main from R_MakeMeATexture
unsigned int teximage = 0;
// incremented once per RenderOpenGL call and mixed into the generated pixel values
unsigned int framecount = 0;

// find the fastest modes to use for glTexSubImage2D
// one candidate format/type combination plus the result of timing it
typedef struct tsitest_s
{
    char formatstr[64];     // printable name of 'format', used in the results printout
    char typestr[64];       // printable name of 'type', used in the results printout
    GLenum format;          // pixel format argument passed to glTexImage2D/glTexSubImage2D
    GLenum type;            // pixel type argument passed to glTexImage2D/glTexSubImage2D
    int modespeed;          // elapsed SDL_GetTicks ticks for the timed upload loop
    bool failed;            // true if glGetError reported an error while testing this mode
} tsitest_t;

// candidate format/type pairs to benchmark, in tsitest_t member order:
// {formatstr, typestr, format, type, modespeed, failed}
// every entry starts out with modespeed 666 and failed == true
tsitest_t tsimodes[] =
{
    {"GL_RGBA", "GL_UNSIGNED_BYTE", GL_RGBA, GL_UNSIGNED_BYTE, 666, true},
    {"GL_BGRA", "GL_UNSIGNED_BYTE", GL_BGRA, GL_UNSIGNED_BYTE, 666, true},
    {"GL_RGBA", "GL_UNSIGNED_INT_8_8_8_8", GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, 666, true},
    {"GL_BGRA", "GL_UNSIGNED_INT_8_8_8_8", GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, 666, true},
    {"GL_RGBA", "GL_UNSIGNED_INT_8_8_8_8_REV", GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, 666, true},
    {"GL_BGRA", "GL_UNSIGNED_INT_8_8_8_8_REV", GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, 666, true}
};


// index into tsimodes of the chosen mode; 666 means "none selected" and is checked in R_SetTSIMode
int fastest = 666;
// lowest modespeed accepted so far; starts at 32768 so any smaller measured time replaces it
int bestspeed = 32768;

// Generates a texture object, leaves it bound to GL_TEXTURE_2D with linear
// min/mag filtering, and allocates level 0 at width x height without
// supplying any pixel data.
//
// The internal format is always GL_RGBA; 'format' and 'type' are only the
// client-side format/type arguments given to glTexImage2D.
//
// Returns the texture name written by glGenTextures (0 if it wrote nothing).
GLuint R_MakeMeATexture (int width, int height, GLenum format, GLenum type)
{
    const GLint filter = GL_LINEAR;
    GLuint newtexture = 0;

    glGenTextures (1, &newtexture);

    glEnable (GL_TEXTURE_2D);
    glBindTexture (GL_TEXTURE_2D, newtexture);

    glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
    glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);

    // no source pointer: storage is specified but nothing is uploaded yet
    glTexImage2D (GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, format, type, NULL);

    // wait for the GL to finish so that later timings do not include this work
    glFinish ();

    return newtexture;
}


// Benchmarks glTexSubImage2D for every entry in tsimodes and selects the
// fastest one that did not raise a GL error.
//
// For each mode: a fresh texture is created, 16 full-size uploads from
// sibuffer are timed with SDL_GetTicks, and the elapsed ticks and error
// status are stored back into that mode's entry.  All results are then
// printed, and the global 'fastest' is set to the index of the best mode
// (with 'bestspeed' holding its time).  Because the comparison is <=, a tie
// goes to the later entry.
//
// If no mode succeeds, 'fastest' keeps its 666 sentinel, a message box is
// shown and the process exits.
void R_SetTSIMode (void)
{
    int numtsimodes = sizeof (tsimodes) / sizeof (tsimodes[0]);

    for (int i = 0; i < numtsimodes; i++)
    {
        // clear the last error (if any) so the check below only sees errors from this mode
        glGetError ();

        // create a new texture object; R_MakeMeATexture leaves it bound to GL_TEXTURE_2D
        GLuint texnum = R_MakeMeATexture (TEX_WIDTH, TEX_HEIGHT, tsimodes[i].format, tsimodes[i].type);

        Uint32 start = SDL_GetTicks ();

        // SDL_GetTicks has insufficient resolution to measure one call so we need to run a few of them
        for (int t = 0; t < 16; t++)
        {
            glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, tsimodes[i].format, tsimodes[i].type, sibuffer);
        }

        // taken as soon as the calls return: this measures time-to-return, not GPU completion
        Uint32 end = SDL_GetTicks ();

        // finish outstanding work before the next mode is timed
        glFinish ();

        glDeleteTextures (1, &texnum);

        tsimodes[i].failed = (glGetError () != GL_NO_ERROR);
        tsimodes[i].modespeed = (end - start);
    }

    for (int i = 0; i < numtsimodes; i++)
    {
        printf ("mode: %i %4ims [%s/%s] (%s)\n", i, tsimodes[i].modespeed, tsimodes[i].formatstr,
            tsimodes[i].typestr, tsimodes[i].failed ? "FAILED" : "OK");

        if (tsimodes[i].modespeed <= bestspeed && !tsimodes[i].failed)
        {
            bestspeed = tsimodes[i].modespeed;
            fastest = i;
        }
    }

    // still the sentinel: every mode failed (or was slower than the initial bestspeed)
    if (fastest == 666)
    {
        MessageBox (NULL, "Failed to find a format!", "Error", MB_OK | MB_ICONSTOP);
        exit (0);
    }
}


// Renders one frame: regenerates the animated test pattern in sibuffer,
// uploads it to 'teximage' with the format/type chosen by R_SetTSIMode
// (tsimodes[fastest]), and draws it as a TEX_WIDTH x TEX_HEIGHT textured quad
// at the top-left of the window.  Does not swap buffers; the caller does.
void RenderOpenGL (void)
{
    framecount++;

    // i is the running texel index into sibuffer; it advances once per inner iteration
    for (int i = 0, w = 0; w < TEX_WIDTH; w++)
    {
        for (int h = 0; h < TEX_HEIGHT; h++, i++)
        {
            // address the four bytes of texel i (not the start of the array)
            unsigned char *rgba = (unsigned char *) &sibuffer[i];

            // 2 == red, 1 == green, 0 == blue
            // NOTE(review): this byte order presumably matches the BGRA modes only - confirm
            rgba[2] = ((h * i) + framecount) & 255;
            rgba[1] = ((w * h) + framecount) & 255;
            rgba[0] = ((w * i) + framecount) & 255;
            rgba[3] = 255;
        }
    }

    glClear (GL_COLOR_BUFFER_BIT);

    glViewport (0, 0, WINDOW_WIDTH, WINDOW_HEIGHT);

    glMatrixMode (GL_MODELVIEW);
    glLoadIdentity ();

    // pixel-space orthographic projection with y increasing downwards
    glMatrixMode (GL_PROJECTION);
    glLoadIdentity ();
    glOrtho (0, WINDOW_WIDTH, WINDOW_HEIGHT, 0, -99999, 99999);

    glBindTexture (GL_TEXTURE_2D, teximage);
    glTexEnvi (GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);

    // this is the call whose performance the test app exists to exercise
    glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, tsimodes[fastest].format, tsimodes[fastest].type, sibuffer);

    glBegin (GL_QUADS);

    glTexCoord2f (0, 0);
    glVertex2f (0, 0);

    glTexCoord2f (1, 0);
    glVertex2f (TEX_WIDTH, 0);

    glTexCoord2f (1, 1);
    glVertex2f (TEX_WIDTH, TEX_HEIGHT);

    glTexCoord2f (0, 1);
    glVertex2f (0, TEX_HEIGHT);

    glEnd ();
}


int main (int argc, char *argv[])
{
    if (SDL_Init (SDL_INIT_VIDEO | SDL_INIT_NOPARACHUTE) != 0)
    {
        printf ("Unable to initialize SDL: %s\n", SDL_GetError ());
        return 1;
    }

    SDL_GL_SetAttribute (SDL_GL_DOUBLEBUFFER, 1);
    SDL_Surface *screen = SDL_SetVideoMode (WINDOW_WIDTH, WINDOW_HEIGHT, 32, SDL_OPENGL);

    R_SetTSIMode ();
    teximage = R_MakeMeATexture (TEX_WIDTH, TEX_HEIGHT, tsimodes[fastest].format, tsimodes[fastest].type);
    glClearColor (0, 0, 0, 1);

    int done = 0;
    SDL_Event evt;

    while (!done)
    {
        while (!done && SDL_PollEvent (&evt))
        {
            if (evt.type == SDL_QUIT)
            {
                done = 1;
                break;
            }
        }

        // run the screen update here
        RenderOpenGL ();
        SDL_GL_SwapBuffers ();
    }

    return 0;
}
0 Kudos
Reply