Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize lighting for softgpu a bit #17295

Merged
merged 5 commits into from
Apr 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions GPU/Software/BinManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,9 @@ void BinManager::UpdateClut(const void *src) {
PROFILE_THIS_SCOPE("bin_clut");
if (cluts_.Full())
Flush("cluts");
clutIndex_ = (uint16_t)cluts_.Push(BinClut());
memcpy(cluts_[clutIndex_].readable, src, sizeof(BinClut));
BinClut &clut = cluts_.PeekPush();
memcpy(clut.readable, src, sizeof(BinClut));
clutIndex_ = (uint16_t)cluts_.PushPeeked();
}

void BinManager::AddTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2) {
Expand Down
3 changes: 2 additions & 1 deletion GPU/Software/BinManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,12 @@ struct BinQueue {
return items_[tail_];
}

void PushPeeked() {
size_t PushPeeked() {
size_t i = tail_++;
if (i + 1 == N)
tail_ -= N;
size_++;
return i;
}

size_t Size() const {
Expand Down
92 changes: 77 additions & 15 deletions GPU/Software/Lighting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ void ComputeState(State *state, bool hasColor0) {
bool anyAmbient = false;
bool anyDiffuse = false;
bool anySpecular = false;
bool anyDirectional = false;
for (int light = 0; light < 4; ++light) {
auto &lstate = state->lights[light];
lstate.enabled = gstate.isLightChanEnabled(light);
Expand All @@ -112,10 +113,12 @@ void ComputeState(State *state, bool hasColor0) {
}

lstate.pos = GetLightVec(gstate.lpos, light);
if (lstate.directional)
if (lstate.directional) {
lstate.pos.NormalizeOr001();
else
anyDirectional = true;
} else {
lstate.att = GetLightVec(gstate.latt, light);
}

if (lstate.spot) {
lstate.spotDir = GetLightVec(gstate.ldir, light);
Expand Down Expand Up @@ -174,6 +177,8 @@ void ComputeState(State *state, bool hasColor0) {
state->baseAmbientColorFactor = LightColorFactor(gstate.getAmbientRGBA(), ones);
state->setColor1 = gstate.isUsingSecondaryColor() && anySpecular;
state->addColor1 = !gstate.isUsingSecondaryColor() && anySpecular;
state->usesWorldPos = anyDirectional;
state->usesWorldNormal = gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP || anyDiffuse || anySpecular;
}

static inline float GenerateLightCoord(VertexData &vertex, const WorldCoords &worldnormal, int light) {
Expand All @@ -192,6 +197,62 @@ void GenerateLightST(VertexData &vertex, const WorldCoords &worldnormal) {
vertex.texturecoords.t() = GenerateLightCoord(vertex, worldnormal, gstate.getUVLS1());
}

#if defined(_M_SSE)
// SSE4.1 helper: rounds f up to a whole number and returns it as an int.
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
static inline int LightCeilSSE4(float f) {
	const __m128 scalar = _mm_set_ss(f);
	// Not especially quick, but it appears to beat a call to ceilf().
	const __m128 roundedUp = _mm_ceil_ss(scalar, scalar);
	return _mm_cvt_ss2si(roundedUp);
}

// SSE4.1 helper: per 32-bit lane, computes (factor * color * scale) >> 19.
// Relies on the upper 16 bits of every factor/color lane being zero.
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
static inline __m128i LightColorScaleBy512SSE4(__m128i factor, __m128i color, __m128i scale) {
	// With the top bits clear, the cheaper 16-bit multiply-add gives the same
	// product as a 32-bit multiply would.
	const __m128i product18 = _mm_madd_epi16(factor, color);
	// The product now occupies 18 bits, so the scale step needs the full 32-bit multiply.
	return _mm_srai_epi32(_mm_mullo_epi32(product18, scale), 19);
}
#endif

// Rounds f up to a whole number and returns it as an int, picking the
// fastest implementation available for the build target / running CPU.
static inline int LightCeil(float f) {
#if defined(_M_SSE)
	// SSE4.1 is a runtime capability, so check before using the intrinsic path.
	if (cpu_info.bSSE4_1) {
		return LightCeilSSE4(f);
	}
	return static_cast<int>(ceilf(f));
#elif PPSSPP_ARCH(ARM64_NEON)
	// Single instruction: float -> s32 conversion rounding toward +infinity.
	return vcvtps_s32_f32(f);
#else
	return static_cast<int>(ceilf(f));
#endif
}

// Returns (factor * color * scale) / (1024 * 512) for each component.
//
// Every input is an s9 value (s9 is used to account for rounding), so the
// triple product is s27; dropping 19 bits leaves an 8-bit result.  All values
// are positive, which is why they can be treated as unsigned and why the SIMD
// paths can use a shift by 19 in place of the division.
static Vec4<int> LightColorScaleBy512(const Vec4<int> &factor, const Vec4<int> &color, int scale) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
	if (cpu_info.bSSE4_1) {
		const __m128i scale4 = _mm_set1_epi32(scale);
		return LightColorScaleBy512SSE4(factor.ivec, color.ivec, scale4);
	}
	return (factor * color * scale) / (1024 * 512);
#elif PPSSPP_ARCH(ARM64_NEON)
	const int32x4_t product = vmulq_s32(factor.ivec, color.ivec);
	return vshrq_n_s32(vmulq_n_s32(product, scale), 19);
#else
	return (factor * color * scale) / (1024 * 512);
#endif
}

// Accumulates src into sum in place (sum is modified, src is not).
// The SIMD paths add the four 32-bit lanes of the underlying ivec directly;
// every other build falls back to Vec4<int>'s own operator+=.
static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
// x86-64 with SSE: packed 32-bit integer add.
sum.ivec = _mm_add_epi32(sum.ivec, src.ivec);
#elif PPSSPP_ARCH(ARM64_NEON)
// ARM64 NEON: packed 32-bit integer add.
sum.ivec = vaddq_s32(sum.ivec, src.ivec);
#else
// Generic path (this also covers 32-bit x86, which the first condition excludes).
sum += src;
#endif
}

void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {
// Lighting blending rounds using the half offset method (like alpha blend.)
const Vec4<int> ones = Vec4<int>::AssignToAll(1);
Expand Down Expand Up @@ -245,11 +306,11 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords

// ambient lighting
if (lstate.ambient) {
int attspot = (int)ceilf(256 * 2 * att * spot + 1);
int attspot = (int)LightCeil(256 * 2 * att * spot + 1);
if (attspot > 512)
attspot = 512;
Vec4<int> lambient = (mac * lstate.ambientColorFactor * attspot) / (1024 * 512);
final_color += lambient;
Vec4<int> lambient = LightColorScaleBy512(lstate.ambientColorFactor, mac, attspot);
LightColorSum(final_color, lambient);
}

// diffuse lighting
Expand All @@ -262,12 +323,12 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
}

if (lstate.diffuse && diffuse_factor > 0.0f) {
int diffuse_attspot = (int)ceilf(256 * 2 * att * spot * diffuse_factor + 1);
int diffuse_attspot = (int)LightCeil(256 * 2 * att * spot * diffuse_factor + 1);
if (diffuse_attspot > 512)
diffuse_attspot = 512;
Vec4<int> mdc = state.colorForDiffuse ? colorFactor : state.material.diffuseColorFactor;
Vec4<int> ldiffuse = (lstate.diffuseColorFactor * mdc * diffuse_attspot) / (1024 * 512);
final_color += ldiffuse;
Vec4<int> ldiffuse = LightColorScaleBy512(lstate.diffuseColorFactor, mdc, diffuse_attspot);
LightColorSum(final_color, ldiffuse);
}

if (lstate.specular && diffuse_factor >= 0.0f) {
Expand All @@ -277,24 +338,25 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
specular_factor = pspLightPow(specular_factor, state.specularExp);

if (specular_factor > 0.0f) {
int specular_attspot = (int)ceilf(256 * 2 * att * spot * specular_factor + 1);
int specular_attspot = (int)LightCeil(256 * 2 * att * spot * specular_factor + 1);
if (specular_attspot > 512)
specular_attspot = 512;

Vec4<int> msc = state.colorForSpecular ? colorFactor : state.material.specularColorFactor;
Vec4<int> lspecular = (lstate.specularColorFactor * msc * specular_attspot) / (1024 * 512);
specular_color += lspecular;
Vec4<int> lspecular = LightColorScaleBy512(lstate.specularColorFactor, msc, specular_attspot);
LightColorSum(specular_color, lspecular);
}
}
}

// Note: these are all naturally clamped by ToRGBA/toRGB.
if (state.setColor1) {
vertex.color0 = final_color.Clamp(0, 255).ToRGBA();
vertex.color1 = specular_color.Clamp(0, 255).rgb().ToRGB();
vertex.color0 = final_color.ToRGBA();
vertex.color1 = specular_color.rgb().ToRGB();
} else if (state.addColor1) {
vertex.color0 = (final_color + specular_color).Clamp(0, 255).ToRGBA();
vertex.color0 = (final_color + specular_color).ToRGBA();
} else {
vertex.color0 = final_color.Clamp(0, 255).ToRGBA();
vertex.color0 = final_color.ToRGBA();
}
}

Expand Down
2 changes: 2 additions & 0 deletions GPU/Software/Lighting.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ struct State {
bool colorForSpecular : 1;
bool setColor1 : 1;
bool addColor1 : 1;
bool usesWorldPos : 1;
bool usesWorldNormal : 1;
};
};

Expand Down
11 changes: 4 additions & 7 deletions GPU/Software/TransformUnit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,12 +273,9 @@ void ComputeTransformState(TransformState *state, const VertexReader &vreader) {
bool canSkipWorldPos = true;
if (state->enableLighting) {
Lighting::ComputeState(&state->lightingState, vreader.hasColor0());
for (int i = 0; i < 4; ++i) {
if (!state->lightingState.lights[i].enabled)
continue;
if (!state->lightingState.lights[i].directional)
canSkipWorldPos = false;
}
canSkipWorldPos = !state->lightingState.usesWorldPos;
} else {
state->lightingState.usesWorldNormal = state->uvGenMode == GE_TEXMAP_ENVIRONMENT_MAP;
}

float world[16];
Expand Down Expand Up @@ -412,7 +409,7 @@ ClipVertexData TransformUnit::ReadVertex(const VertexReader &vreader, const Tran
vertex.v.clipw = vertex.clippos.w;

Vec3<float> worldnormal;
if (state.enableLighting || state.uvGenMode == GE_TEXMAP_ENVIRONMENT_MAP) {
if (state.lightingState.usesWorldNormal) {
worldnormal = TransformUnit::ModelToWorldNormal(normal);
worldnormal.NormalizeOr001();
}
Expand Down