GL_ARRAY_BUFFER_ARB
Yeah, so I implemented it and must be doing something wrong. My frames per second counter is reporting worse results using it than immediate mode. Shark reports better statistics, and OpenGL profiler isn't much help because the application runs like ass while under the watch of the profiler.
Here is the relevant code; it's really long:
Code:
void VARParticle::Render()
{
float viewMatrix[16];
glGetFloatv(GL_MODELVIEW_MATRIX, viewMatrix);
vectorfloat right;
right.e[0] = viewMatrix[0];
right.e[1] = viewMatrix[4];
right.e[2] = viewMatrix[8];
vectorfloat up;
up.e[0] = viewMatrix[1];
up.e[1] = viewMatrix[5];
up.e[2] = viewMatrix[9];
vectorfloat rightPlusUp;
vectorfloat rightMinusUp;
vectorfloat upMinusRight;
vScopy(4, &right.v, &rightPlusUp.v);
vScopy(4, &right.v, &rightMinusUp.v);
vScopy(4, &up.v, &upMinusRight.v);
vSaxpy(4, 1.0f, &up.v, &rightPlusUp.v);
vSaxpy(4, -1.0f, &up.v, &rightMinusUp.v);
vSaxpy(4, -1.0f, &right.v, &upMinusRight.v);
#ifdef VBUFF
//Initialize the buffer mapping for this frame
//Bind it to this buffer..
glBindBufferARB(GL_ARRAY_BUFFER_ARB, bufferIdentifier);
glVertexPointer(4, GL_FLOAT, 3 * 4 * sizeof(GLfloat) , array_pointer);
glEnableClientState(GL_VERTEX_ARRAY);
glColorPointer(4, GL_FLOAT, 3 * 4 * sizeof(GLfloat), array_pointer + 4 * sizeof(GLfloat));
glEnableClientState(GL_COLOR_ARRAY);
glTexCoordPointer(2, GL_FLOAT, 3 * 4 * sizeof(GLfloat), array_pointer + 8 * sizeof(GLfloat));
glEnableClientState(GL_TEXTURE_COORD_ARRAY);
//now map it
p = (float*)glMapBufferARB(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY);
//buffer should now be pointing to a block in memory large enough
//to fit all attributes that need to be drawn per particles
#endif
glDepthMask(GL_FALSE);
glEnable(GL_BLEND);
glBlendFunc(properties.sourceType, properties.destType);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, m_texture);
glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_MODULATE);
GLfloat size;
vectorfloat pos;
#ifndef VBUFF
glBegin(GL_QUADS);
#endif
for (int i = 0; i < m_numParticles; ++i)
{
size = m_particleList[i].m_size/2;
glColor4fv(m_particleList[i].m_color);
vScopy(4, &(m_particleList[i].m_pos.v), &pos.v);
vSaxpy(4, -size, &rightPlusUp.v , &pos.v);
#ifndef VBUFF
glTexCoord2f(0.0, 0.0);
glVertex3f(pos.e[0], pos.e[1], pos.e[2]);
#else
//Add vertex 1 to the buffer allocated
(*p++) = pos.e[0];
(*p++) = pos.e[1];
(*p++) = pos.e[2];
(*p++) = 1.0f;
(*p++) = m_particleList[i].m_color[0];
(*p++) = m_particleList[i].m_color[1];
(*p++) = m_particleList[i].m_color[2];
(*p++) = m_particleList[i].m_color[3];
(*p++) = 0.0f;
(*p++) = 0.0f;
++p;
++p;
#endif
vScopy(4, &(m_particleList[i].m_pos.v), &pos.v);
vSaxpy(4, size, &rightMinusUp.v, &pos.v);
#ifndef VBUFF
glTexCoord2f(1.0, 0.0);
glVertex3f(pos.e[0], pos.e[1], pos.e[2]);
#else
(*p++) = pos.e[0];
(*p++) = pos.e[1];
(*p++) = pos.e[2];
(*p++) = 1.0f;
(*p++) = m_particleList[i].m_color[0];
(*p++) = m_particleList[i].m_color[1];
(*p++) = m_particleList[i].m_color[2];
(*p++) = m_particleList[i].m_color[3];
(*p++) = 1.0f;
(*p++) = 0.0f;
++p;
++p;
#endif
vScopy(4, &(m_particleList[i].m_pos.v), &pos.v);
vSaxpy(4, size, &rightPlusUp.v, &pos.v);
#ifndef VBUFF
glTexCoord2f(1.0, 1.0);
glVertex3f(pos.e[0], pos.e[1], pos.e[2]);
#else
(*p++) = pos.e[0];
(*p++) = pos.e[1];
(*p++) = pos.e[2];
(*p++) = 1.0f;
(*p++) = m_particleList[i].m_color[0];
(*p++) = m_particleList[i].m_color[1];
(*p++) = m_particleList[i].m_color[2];
(*p++) = m_particleList[i].m_color[3];
(*p++) = 1.0f;
(*p++) = 1.0f;
++p;
++p;
#endif
vScopy(4, &(m_particleList[i].m_pos.v), &pos.v);
vSaxpy(4, size, &upMinusRight.v, &pos.v);
#ifndef VBUFF
glTexCoord2f(0.0, 1.0);
glVertex3f(pos.e[0], pos.e[1], pos.e[2]);
#else
(*p++) = pos.e[0];
(*p++) = pos.e[1];
(*p++) = pos.e[2];
(*p++) = 1.0f;
(*p++) = m_particleList[i].m_color[0];
(*p++) = m_particleList[i].m_color[1];
(*p++) = m_particleList[i].m_color[2];
(*p++) = m_particleList[i].m_color[3];
(*p++) = 0.0f;
(*p++) = 1.0f;
++p;
++p;
#endif
((VARParticle*)this->*trailFunction)(&(m_particleList[i]), up, right);
}
#ifndef VBUFF
glEnd();
#else
//Now unmap the buffer
if(glUnmapBufferARB(GL_ARRAY_BUFFER_ARB))
{//Now draw the arrays in it
int toDraw = 4*m_numParticles;
if(properties.trailCount > 0)
toDraw = 4*m_numParticles*properties.trailCount + 4*m_numParticles;
glDrawArrays(GL_QUADS, 0, toDraw);
}
//unbind the array buffer
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
#endif
glDisable(GL_TEXTURE_2D);
glDisable(GL_BLEND);
glDepthMask(GL_TRUE);
}
What is the usage mode you set when you made your initial glBufferDataARB call?
If you're respecifying the vertex data every frame, you may well find you need to double-buffer your VBOs to get decent performance.
If GL profiler is having a substantially adverse effect on your framerate, something's wrong. Maybe uncheck the "collect trace" box on the main window.
If you're respecifying the vertex data every frame, you may well find you need to double-buffer your VBOs to get decent performance.
If GL profiler is having a substantially adverse effect on your framerate, something's wrong. Maybe uncheck the "collect trace" box on the main window.
The set up code:
I'll try with the collect trace off next time, but I need breakfast first.
Code:
// One-time creation of the vertex buffer object used by the VBUFF render path.
#ifdef VBUFF
// Generate one buffer object name and bind it as the current array buffer.
glGenBuffersARB(1, &bufferIdentifier);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, bufferIdentifier);
// Size in bytes:
//   (max particles * (trail segments + 1 for the particle itself)) quads
//   * 3 attribute slots per vertex * 4 vertices per quad
//   * 4 floats per attribute slot * sizeof(GLfloat)
int bufferLength = (m_maxParticles * (properties.trailCount + 1)) * 3 * 4 * 4 * sizeof(GLfloat);
// Passing NULL allocates the storage without uploading any data; the
// STREAM_DRAW usage hint is for data that is respecified about once per draw.
glBufferDataARB(GL_ARRAY_BUFFER_ARB, bufferLength, NULL, GL_STREAM_DRAW_ARB);
#endif
I'll try with the collect trace off next time, but I need breakfast first.
I set up some code to write out to a file how long rendering each frame took. Here are the (abbreviated) results of rendering in immediate mode.
And here they are using ARB
Not only is it slower, it's twice as slow. I'm starting to think that maybe vertex buffers aren't meant for dynamic data? Because that is slow.
Code:
Sample: 0.000172
Sample: 0.000165
Sample: 0.000117
Sample: 0.000123
Sample: 0.000128
Sample: 0.000151
Sample: 0.000069
Sample: 0.000072
Sample: 0.000082
Sample: 0.000097
Sample: 0.000095
Sample: 0.000140
And here they are using ARB
Code:
Sample: 0.000450
Sample: 0.000375
Sample: 0.000341
Sample: 0.000349
Sample: 0.000335
Sample: 0.000444
Sample: 0.000357
Sample: 0.000357
Sample: 0.000338
Sample: 0.000350
Sample: 0.000338
Sample: 0.000349
Sample: 0.000348
Sample: 0.000345
Not only is it slower, it's twice as slow. I'm starting to think that maybe vertex buffers aren't meant for dynamic data? Because that is slow.
For dynamic data, I found a slight speedup using VBOs, but not until I'd double-buffered them. Before that, they were slightly slower.
How do you double-buffer VBOs? Is it a matter of preparing the VBO on a separate thread while the first is executing? Or are you just oscillating between which thread is being written to and which is drawn?
Have two VBOs, switch which one you use each frame.
I implemented double buffering, but it wasn't any faster. I'll just stick to immediate mode; with the Accelerate optimizations I was just running 46,000 particles (not that you'd ever need that many) at 40 FPS on my dual 2 GHz G5 with a Radeon 9800 Pro. That's good enough, in my opinion.

