Skip to content

Commit

Permalink
Merge pull request #16005 from unknownbrackets/softgpu-fog
Browse files Browse the repository at this point in the history
softgpu: Correct accuracy of fog calculation
  • Loading branch information
hrydgard committed Sep 11, 2022
2 parents 35c9cbd + 8c55e18 commit 46500c8
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 94 deletions.
10 changes: 5 additions & 5 deletions GPU/Math3D.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,11 @@ class Vec3

Vec3() {}
Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
#if defined(_M_SSE)
Vec3(const __m128 &_vec) : vec(_vec) {}
Vec3(const __m128i &_ivec) : ivec(_ivec) {}
constexpr Vec3(const __m128 &_vec) : vec(_vec) {}
constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {}
Vec3(const Vec3Packed<T> &_xyz) {
vec = _mm_loadu_ps(_xyz.AsArray());
}
Expand All @@ -249,7 +249,7 @@ class Vec3
#endif

template<typename T2>
Vec3<T2> Cast() const
constexpr Vec3<T2> Cast() const
{
return Vec3<T2>((T2)x, (T2)y, (T2)z);
}
Expand All @@ -258,7 +258,7 @@ class Vec3
static Vec3 FromRGB(unsigned int rgb);
unsigned int ToRGB() const; // alpha bits set to zero

static Vec3 AssignToAll(const T& f)
static constexpr Vec3 AssignToAll(const T& f)
{
return Vec3<T>(f, f, f);
}
Expand Down
6 changes: 3 additions & 3 deletions GPU/Software/DrawPixel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,9 @@ void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg colo
// Fog is applied prior to color test.
if (pixelID.applyFog && !clearMode) {
Vec3<int> fogColor = Vec3<int>::FromRGB(pixelID.cached.fogColor);
fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog)) / 255;
// This is very similar to the BLEND texfunc: adding 255 before dividing by 256 simply always rounds up.
static constexpr Vec3<int> roundup = Vec3<int>::AssignToAll(255);
fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog) + roundup) / 256;
prim_color.r() = fogColor.r();
prim_color.g() = fogColor.g();
prim_color.b() = fogColor.b();
Expand Down Expand Up @@ -548,8 +550,6 @@ void PixelJitCache::Clear() {

constBlendHalf_11_4s_ = nullptr;
constBlendInvert_11_4s_ = nullptr;
const255_16s_ = nullptr;
constBy255i_ = nullptr;
}

std::string PixelJitCache::DescribeCodePtr(const u8 *ptr) {
Expand Down
2 changes: 0 additions & 2 deletions GPU/Software/DrawPixel.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,6 @@ class PixelJitCache : public Rasterizer::CodeBlock {

const u8 *constBlendHalf_11_4s_ = nullptr;
const u8 *constBlendInvert_11_4s_ = nullptr;
const u8 *const255_16s_ = nullptr;
const u8 *constBy255i_ = nullptr;

#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
void Discard();
Expand Down
26 changes: 12 additions & 14 deletions GPU/Software/DrawPixelX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,12 +336,6 @@ void PixelJitCache::WriteConstantPool(const PixelFuncID &id) {

// This is used for shifted blend factors, to invert them.
WriteSimpleConst8x16(constBlendInvert_11_4s_, 0xFF << 4);

// A set of 255s, used to inverse fog.
WriteSimpleConst8x16(const255_16s_, 0xFF);

// This is used for a multiply that divides by 255 with shifting.
WriteSimpleConst8x16(constBy255i_, 0x8081);
}

bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {
Expand Down Expand Up @@ -535,7 +529,8 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {

// Build a set of 255s at 16 bit in a reg for later (all ones, then each word shifted right by 8)...
X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2);
MOVDQA(invertReg, M(const255_16s_));
PCMPEQW(invertReg, R(invertReg));
PSRLW(invertReg, 8);

// Expand (we clamped) color to 16 bit as well, so we can multiply with fog.
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
Expand Down Expand Up @@ -568,21 +563,24 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
// We can free up the actual fog reg now.
regCache_.ForceRelease(RegCache::GEN_ARG_FOG);

// Our goal here is to calculate this formula:
// (argColor * fog + fogColor * (255 - fog) + 255) / 256

// Now we multiply the existing color by fog...
PMULLW(argColorReg, R(fogMultReg));
// And then inverse the fog value using those 255s we loaded, and multiply by fog color.
PSUBUSW(invertReg, R(fogMultReg));
// Before inverting, let's add in that 255 as well, since we already have it in a register.
PADDW(argColorReg, R(invertReg));
// And then invert the fog value (255 - fog) using those 255s, and multiply by fog color.
PSUBW(invertReg, R(fogMultReg));
PMULLW(fogColorReg, R(invertReg));
// At this point, argColorReg and fogColorReg are multiplied at 16-bit, so we need to sum.
PADDUSW(argColorReg, R(fogColorReg));
PADDW(argColorReg, R(fogColorReg));
regCache_.Release(fogColorReg, RegCache::VEC_TEMP1);
regCache_.Release(invertReg, RegCache::VEC_TEMP2);
regCache_.Release(fogMultReg, RegCache::VEC_TEMP3);

// Now to divide by 255, we use bit tricks: multiply by 0x8081, and shift right by 16+7.
PMULHUW(argColorReg, M(constBy255i_));
// Now shift right by 7 (PMULHUW already did 16 of the shift.)
PSRLW(argColorReg, 7);
// Now we simply divide by 256, or in other words shift right by 8.
PSRLW(argColorReg, 8);

// Okay, put A back in, we'll shrink it to 8888 when needed.
PINSRW(argColorReg, R(alphaReg), 3);
Expand Down
6 changes: 3 additions & 3 deletions GPU/Software/Rasterizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc

return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128()));
#else
Vec3<int> half = Vec3<int>::AssignToAll(1);
static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return lhs + rhs;
Expand All @@ -370,7 +370,7 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc

return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128()));
#else
Vec3<int> half = Vec3<int>::AssignToAll(1);
static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return lhs - rhs;
Expand All @@ -392,7 +392,7 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc

return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128()));
#else
Vec3<int> half = Vec3<int>::AssignToAll(1);
static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return rhs - lhs;
Expand Down
4 changes: 1 addition & 3 deletions GPU/Software/RasterizerRectangle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,6 @@ static inline bool NoClampOrWrap(const RasterizerState &state, const Vec2f &tc)
return false;
if (state.samplerID.cached.sizes[0].w > 512 || state.samplerID.cached.sizes[0].h > 512)
return false;
if (!state.throughMode)
return tc.x <= 1.0f && tc.y <= 1.0f;
return tc.x <= state.samplerID.cached.sizes[0].w && tc.y <= state.samplerID.cached.sizes[0].h;
}

Expand All @@ -288,7 +286,7 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &b
// Currently only works for TL/BR, which is the most common but not required.
bool orient_check = xdiff >= 0 && ydiff >= 0;
// We already have a fast path for clear in ClearRectangle.
bool state_check = !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(state, v0.texturecoords) && NoClampOrWrap(state, v1.texturecoords);
bool state_check = state.throughMode && !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(state, v0.texturecoords) && NoClampOrWrap(state, v1.texturecoords);
// This doesn't work well with offset drawing, see #15876. Through mode never has a subpixel offset.
bool subpixel_check = ((v0.screenpos.x | v0.screenpos.y | v1.screenpos.x | v1.screenpos.y) & 0xF) == 0;
if ((coord_check || !state.enableTextures) && orient_check && state_check && subpixel_check) {
Expand Down
Loading

0 comments on commit 46500c8

Please sign in to comment.