OpenGL - 几何着色器阴影贴图通道的性能非常差
OpenGL - Geometry shader shadow mapping pass performing terribly
我正在使用方差阴影贴图为多个点光源计算阴影。每个立方体贴图的 6 个面都通过几何着色器在单个通道中渲染,对每个光源重复此过程,所有结果存储在一个立方体贴图数组中。这一切都运行良好,16 盏灯以 60fps 运行毫无问题。
为了进一步优化,我尝试将整个过程移动到单个几何着色器通道,结果却达到了我硬件仅有的 113 个顶点输出限制。出于好奇,我决定只渲染 4 盏灯(72 个发射的顶点),令我惊讶的是它下降到 24fps。
那么,为什么 16 盏灯和 16 次渲染通道的性能明显优于单次渲染 4 盏灯?
代码基本相同。
#version 400 core
// Geometry shader for one light: every input triangle is re-emitted six times,
// once per cube-map face, and each copy is routed to its own framebuffer layer.
layout(triangles) in;
layout (triangle_strip, max_vertices=18) out; // 6 faces * 3 vertices
uniform int lightID;              // selects the block of 6 consecutive layers written by this pass
out vec4 frag_position;           // input position transformed by the face's shadow transform
uniform mat4 projectionMatrix;
uniform mat4 shadowTransforms[6]; // one transform per cube face, indexed by face
void main()
{
    for (int face = 0; face < 6; face++)
    {
        // Keep the layer in a local. Every output variable, gl_Layer included,
        // becomes undefined once EmitVertex() returns, so it has to be written
        // again for each vertex instead of once per face.
        int layer = face + (lightID * 6);
        for (int i = 0; i < 3; i++)
        {
            gl_Layer = layer;
            frag_position = shadowTransforms[face] * gl_in[i].gl_Position;
            gl_Position = projectionMatrix * shadowTransforms[face] * gl_in[i].gl_Position;
            EmitVertex();
        }
        EndPrimitive();
    }
}
对比
#version 400 core
// Geometry shader for four lights at once: every input triangle is re-emitted
// 24 times (4 lights * 6 cube faces), each copy routed to its own framebuffer layer.
layout(triangles) in;
layout (triangle_strip, max_vertices=72) out; // 4 lights * 6 faces * 3 vertices
out vec4 frag_position;            // input position transformed by the layer's shadow transform
uniform mat4 projectionMatrix;
uniform mat4 shadowTransforms[24]; // indexed by layer = face + light * 6
void main()
{
    for (int lightSource = 0; lightSource < 4; lightSource++)
    {
        for (int face = 0; face < 6; face++)
        {
            // Keep the layer in a local. Output variables (gl_Layer included) are
            // undefined once EmitVertex() returns, so gl_Layer must neither be
            // read back as an array index nor assumed to persist between vertices.
            int layer = face + (lightSource * 6);
            for (int i = 0; i < 3; i++)
            {
                gl_Layer = layer;
                frag_position = shadowTransforms[layer] * gl_in[i].gl_Position;
                gl_Position = projectionMatrix * shadowTransforms[layer] * gl_in[i].gl_Position;
                EmitVertex();
            }
            EndPrimitive();
        }
    }
}
和
/// <summary>
/// Renders the shadow maps one light at a time: for each entry in <c>lights</c> the six
/// per-face view matrices and the light's index are uploaded, then the scene is drawn.
/// </summary>
/// <param name="shader">Shader program activated for the pass; its uniforms are set here.</param>
public void ShadowMapsPass(Shader shader)
{
    // Activate the program and size the viewport to one cube map face
    GL.UseProgram(shader.ID);
    GL.Viewport(0, 0, CubeMapArray.size, CubeMapArray.size);

    // Clear the cube map array data left over from the previous frame
    GL.BindFramebuffer(FramebufferTarget.Framebuffer, shadowMapArray.FBO_handle);
    GL.ClearColor(Color.White);
    GL.Clear(ClearBufferMask.ColorBufferBit | ClearBufferMask.DepthBufferBit);

    // Look direction and matching up vector for each face, in upload order
    Vector3[] faceDirections =
    {
        new Vector3(1, 0, 0), new Vector3(-1, 0, 0),
        new Vector3(0, 1, 0), new Vector3(0, -1, 0),
        new Vector3(0, 0, 1), new Vector3(0, 0, -1)
    };
    Vector3[] faceUps =
    {
        new Vector3(0, -1, 0), new Vector3(0, -1, 0),
        new Vector3(0, 0, 1), new Vector3(0, 0, -1),
        new Vector3(0, -1, 0), new Vector3(0, -1, 0)
    };

    for (int lightIndex = 0; lightIndex < lights.Count; lightIndex++)
    {
        var eye = lights[lightIndex].position;

        // Build each face's view matrix and send it straight to its uniform slot
        for (int face = 0; face < faceDirections.Length; face++)
        {
            Matrix4 faceView = Matrix4.LookAt(eye, eye + faceDirections[face], faceUps[face]);
            GL.UniformMatrix4(shader.getUniformID("shadowTransforms[" + face + "]"), false, ref faceView);
        }

        // Tell the shader which light this draw belongs to, then render the scene
        GL.Uniform1(shader.getUniformID("lightID"), lightIndex);
        DrawScene(shader, false);
    }
}
对比
/// <summary>
/// Renders the shadow maps for all lights with a single scene draw: six view matrices
/// are built per entry in <c>lights</c>, uploaded to consecutive <c>shadowTransforms</c>
/// slots, and the scene is then drawn once.
/// </summary>
/// <param name="shader">Shader program activated for the pass; its uniforms are set here.</param>
public void ShadowMapsPass(Shader shader)
{
    // Activate the program and size the viewport to one cube map face
    GL.UseProgram(shader.ID);
    GL.Viewport(0, 0, CubeMapArray.size, CubeMapArray.size);

    // Clear the cube map array data left over from the previous frame
    GL.BindFramebuffer(FramebufferTarget.Framebuffer, shadowMapArray.FBO_handle);
    GL.ClearColor(Color.White);
    GL.Clear(ClearBufferMask.ColorBufferBit | ClearBufferMask.DepthBufferBit);

    // Look direction and matching up vector for each face, in slot order
    Vector3[] faceDirections =
    {
        new Vector3(1, 0, 0), new Vector3(-1, 0, 0),
        new Vector3(0, 1, 0), new Vector3(0, -1, 0),
        new Vector3(0, 0, 1), new Vector3(0, 0, -1)
    };
    Vector3[] faceUps =
    {
        new Vector3(0, -1, 0), new Vector3(0, -1, 0),
        new Vector3(0, 0, 1), new Vector3(0, 0, -1),
        new Vector3(0, -1, 0), new Vector3(0, -1, 0)
    };

    // Collect the view matrices of every light, six per light, light after light
    List<Matrix4> allFaceViews = new List<Matrix4>();
    for (int lightIndex = 0; lightIndex < lights.Count; lightIndex++)
    {
        var eye = lights[lightIndex].position;
        for (int face = 0; face < faceDirections.Length; face++)
        {
            allFaceViews.Add(Matrix4.LookAt(eye, eye + faceDirections[face], faceUps[face]));
        }
    }

    // Upload each matrix to the uniform array element with the same index
    for (int slot = 0; slot < allFaceViews.Count; slot++)
    {
        Matrix4 faceView = allFaceViews[slot];
        GL.UniformMatrix4(shader.getUniformID("shadowTransforms[" + slot + "]"), false, ref faceView);
    }

    DrawScene(shader, false);
}
我猜第二种形式的代码并行执行的机会更少。第一个版本的几何着色器每次生成 18 个顶点,必须执行 4 次,但这 4 次执行可以并行运行。第二个版本则依次生成 72 个顶点。
我正在使用方差阴影贴图为多个点光源计算阴影。每个立方体贴图的 6 个面都通过几何着色器在单个通道中渲染,对每个光源重复此过程,所有结果存储在一个立方体贴图数组中。这一切都运行良好,16 盏灯以 60fps 运行毫无问题。
为了进一步优化,我尝试将整个过程移动到单个几何着色器通道,结果却达到了我硬件仅有的 113 个顶点输出限制。出于好奇,我决定只渲染 4 盏灯(72 个发射的顶点),令我惊讶的是它下降到 24fps。
那么,为什么 16 盏灯和 16 次渲染通道的性能明显优于单次渲染 4 盏灯?
代码基本相同。
#version 400 core
// Geometry shader for one light: every input triangle is re-emitted six times,
// once per cube-map face, and each copy is routed to its own framebuffer layer.
layout(triangles) in;
layout (triangle_strip, max_vertices=18) out; // 6 faces * 3 vertices
uniform int lightID;              // selects the block of 6 consecutive layers written by this pass
out vec4 frag_position;           // input position transformed by the face's shadow transform
uniform mat4 projectionMatrix;
uniform mat4 shadowTransforms[6]; // one transform per cube face, indexed by face
void main()
{
    for (int face = 0; face < 6; face++)
    {
        // Keep the layer in a local. Every output variable, gl_Layer included,
        // becomes undefined once EmitVertex() returns, so it has to be written
        // again for each vertex instead of once per face.
        int layer = face + (lightID * 6);
        for (int i = 0; i < 3; i++)
        {
            gl_Layer = layer;
            frag_position = shadowTransforms[face] * gl_in[i].gl_Position;
            gl_Position = projectionMatrix * shadowTransforms[face] * gl_in[i].gl_Position;
            EmitVertex();
        }
        EndPrimitive();
    }
}
对比
#version 400 core
// Geometry shader for four lights at once: every input triangle is re-emitted
// 24 times (4 lights * 6 cube faces), each copy routed to its own framebuffer layer.
layout(triangles) in;
layout (triangle_strip, max_vertices=72) out; // 4 lights * 6 faces * 3 vertices
out vec4 frag_position;            // input position transformed by the layer's shadow transform
uniform mat4 projectionMatrix;
uniform mat4 shadowTransforms[24]; // indexed by layer = face + light * 6
void main()
{
    for (int lightSource = 0; lightSource < 4; lightSource++)
    {
        for (int face = 0; face < 6; face++)
        {
            // Keep the layer in a local. Output variables (gl_Layer included) are
            // undefined once EmitVertex() returns, so gl_Layer must neither be
            // read back as an array index nor assumed to persist between vertices.
            int layer = face + (lightSource * 6);
            for (int i = 0; i < 3; i++)
            {
                gl_Layer = layer;
                frag_position = shadowTransforms[layer] * gl_in[i].gl_Position;
                gl_Position = projectionMatrix * shadowTransforms[layer] * gl_in[i].gl_Position;
                EmitVertex();
            }
            EndPrimitive();
        }
    }
}
和
/// <summary>
/// Renders the shadow maps one light at a time: for each entry in <c>lights</c> the six
/// per-face view matrices and the light's index are uploaded, then the scene is drawn.
/// </summary>
/// <param name="shader">Shader program activated for the pass; its uniforms are set here.</param>
public void ShadowMapsPass(Shader shader)
{
    // Activate the program and size the viewport to one cube map face
    GL.UseProgram(shader.ID);
    GL.Viewport(0, 0, CubeMapArray.size, CubeMapArray.size);

    // Clear the cube map array data left over from the previous frame
    GL.BindFramebuffer(FramebufferTarget.Framebuffer, shadowMapArray.FBO_handle);
    GL.ClearColor(Color.White);
    GL.Clear(ClearBufferMask.ColorBufferBit | ClearBufferMask.DepthBufferBit);

    // Look direction and matching up vector for each face, in upload order
    Vector3[] faceDirections =
    {
        new Vector3(1, 0, 0), new Vector3(-1, 0, 0),
        new Vector3(0, 1, 0), new Vector3(0, -1, 0),
        new Vector3(0, 0, 1), new Vector3(0, 0, -1)
    };
    Vector3[] faceUps =
    {
        new Vector3(0, -1, 0), new Vector3(0, -1, 0),
        new Vector3(0, 0, 1), new Vector3(0, 0, -1),
        new Vector3(0, -1, 0), new Vector3(0, -1, 0)
    };

    for (int lightIndex = 0; lightIndex < lights.Count; lightIndex++)
    {
        var eye = lights[lightIndex].position;

        // Build each face's view matrix and send it straight to its uniform slot
        for (int face = 0; face < faceDirections.Length; face++)
        {
            Matrix4 faceView = Matrix4.LookAt(eye, eye + faceDirections[face], faceUps[face]);
            GL.UniformMatrix4(shader.getUniformID("shadowTransforms[" + face + "]"), false, ref faceView);
        }

        // Tell the shader which light this draw belongs to, then render the scene
        GL.Uniform1(shader.getUniformID("lightID"), lightIndex);
        DrawScene(shader, false);
    }
}
对比
/// <summary>
/// Renders the shadow maps for all lights with a single scene draw: six view matrices
/// are built per entry in <c>lights</c>, uploaded to consecutive <c>shadowTransforms</c>
/// slots, and the scene is then drawn once.
/// </summary>
/// <param name="shader">Shader program activated for the pass; its uniforms are set here.</param>
public void ShadowMapsPass(Shader shader)
{
    // Activate the program and size the viewport to one cube map face
    GL.UseProgram(shader.ID);
    GL.Viewport(0, 0, CubeMapArray.size, CubeMapArray.size);

    // Clear the cube map array data left over from the previous frame
    GL.BindFramebuffer(FramebufferTarget.Framebuffer, shadowMapArray.FBO_handle);
    GL.ClearColor(Color.White);
    GL.Clear(ClearBufferMask.ColorBufferBit | ClearBufferMask.DepthBufferBit);

    // Look direction and matching up vector for each face, in slot order
    Vector3[] faceDirections =
    {
        new Vector3(1, 0, 0), new Vector3(-1, 0, 0),
        new Vector3(0, 1, 0), new Vector3(0, -1, 0),
        new Vector3(0, 0, 1), new Vector3(0, 0, -1)
    };
    Vector3[] faceUps =
    {
        new Vector3(0, -1, 0), new Vector3(0, -1, 0),
        new Vector3(0, 0, 1), new Vector3(0, 0, -1),
        new Vector3(0, -1, 0), new Vector3(0, -1, 0)
    };

    // Collect the view matrices of every light, six per light, light after light
    List<Matrix4> allFaceViews = new List<Matrix4>();
    for (int lightIndex = 0; lightIndex < lights.Count; lightIndex++)
    {
        var eye = lights[lightIndex].position;
        for (int face = 0; face < faceDirections.Length; face++)
        {
            allFaceViews.Add(Matrix4.LookAt(eye, eye + faceDirections[face], faceUps[face]));
        }
    }

    // Upload each matrix to the uniform array element with the same index
    for (int slot = 0; slot < allFaceViews.Count; slot++)
    {
        Matrix4 faceView = allFaceViews[slot];
        GL.UniformMatrix4(shader.getUniformID("shadowTransforms[" + slot + "]"), false, ref faceView);
    }

    DrawScene(shader, false);
}
我猜第二种形式的代码并行执行的机会更少。第一个版本的几何着色器每次生成 18 个顶点,必须执行 4 次,但这 4 次执行可以并行运行。第二个版本则依次生成 72 个顶点。