Merge pull request #16005 from unknownbrackets/softgpu-fog

softgpu: Correct accuracy of fog calculation
hrydgard · Sep 11, 2022 · 46500c8 · 46500c8
2 parents 35c9cbd + 8c55e18
commit 46500c8
Show file tree

Hide file tree

Showing 8 changed files with 89 additions and 94 deletions.
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
@@ -228,11 +228,11 @@ class Vec3
 
 	Vec3() {}
 	Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
-	Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
+	constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
 	Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
 #if defined(_M_SSE)
-	Vec3(const __m128 &_vec) : vec(_vec) {}
-	Vec3(const __m128i &_ivec) : ivec(_ivec) {}
+	constexpr Vec3(const __m128 &_vec) : vec(_vec) {}
+	constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {}
 	Vec3(const Vec3Packed<T> &_xyz) {
 		vec = _mm_loadu_ps(_xyz.AsArray());
 	}
@@ -249,7 +249,7 @@ class Vec3
 #endif
 
 	template<typename T2>
-	Vec3<T2> Cast() const
+	constexpr Vec3<T2> Cast() const
 	{
 		return Vec3<T2>((T2)x, (T2)y, (T2)z);
 	}
@@ -258,7 +258,7 @@ class Vec3
 	static Vec3 FromRGB(unsigned int rgb);
 	unsigned int ToRGB() const; // alpha bits set to zero
 
-	static Vec3 AssignToAll(const T& f)
+	static constexpr Vec3 AssignToAll(const T& f)
 	{
 		return Vec3<T>(f, f, f);
 	}

diff --git a/GPU/Software/DrawPixel.cpp b/GPU/Software/DrawPixel.cpp
@@ -415,7 +415,9 @@ void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg colo
 	// Fog is applied prior to color test.
 	if (pixelID.applyFog && !clearMode) {
 		Vec3<int> fogColor = Vec3<int>::FromRGB(pixelID.cached.fogColor);
-		fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog)) / 255;
+		// This is very similar to the BLEND texfunc, and simply always rounds up.
+		static constexpr Vec3<int> roundup = Vec3<int>::AssignToAll(255);
+		fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog) + roundup) / 256;
 		prim_color.r() = fogColor.r();
 		prim_color.g() = fogColor.g();
 		prim_color.b() = fogColor.b();
@@ -548,8 +550,6 @@ void PixelJitCache::Clear() {
 
 	constBlendHalf_11_4s_ = nullptr;
 	constBlendInvert_11_4s_ = nullptr;
-	const255_16s_ = nullptr;
-	constBy255i_ = nullptr;
 }
 
 std::string PixelJitCache::DescribeCodePtr(const u8 *ptr) {

diff --git a/GPU/Software/DrawPixel.h b/GPU/Software/DrawPixel.h
@@ -106,8 +106,6 @@ class PixelJitCache : public Rasterizer::CodeBlock {
 
 	const u8 *constBlendHalf_11_4s_ = nullptr;
 	const u8 *constBlendInvert_11_4s_ = nullptr;
-	const u8 *const255_16s_ = nullptr;
-	const u8 *constBy255i_ = nullptr;
 
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 	void Discard();

diff --git a/GPU/Software/DrawPixelX86.cpp b/GPU/Software/DrawPixelX86.cpp
@@ -336,12 +336,6 @@ void PixelJitCache::WriteConstantPool(const PixelFuncID &id) {
 
 	// This is used for shifted blend factors, to inverse them.
 	WriteSimpleConst8x16(constBlendInvert_11_4s_, 0xFF << 4);
-
-	// A set of 255s, used to inverse fog.
-	WriteSimpleConst8x16(const255_16s_, 0xFF);
-
-	// This is used for a multiply that divides by 255 with shifting.
-	WriteSimpleConst8x16(constBy255i_, 0x8081);
 }
 
 bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {
@@ -535,7 +529,8 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
 
 	// Load a set of 255s at 16 bit into a reg for later...
 	X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2);
-	MOVDQA(invertReg, M(const255_16s_));
+	PCMPEQW(invertReg, R(invertReg));
+	PSRLW(invertReg, 8);
 
 	// Expand (we clamped) color to 16 bit as well, so we can multiply with fog.
 	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
@@ -568,21 +563,24 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
 	// We can free up the actual fog reg now.
 	regCache_.ForceRelease(RegCache::GEN_ARG_FOG);
 
+	// Our goal here is to calculate this formula:
+	// (argColor * fog + fogColor * (255 - fog) + 255) / 256
+
 	// Now we multiply the existing color by fog...
 	PMULLW(argColorReg, R(fogMultReg));
-	// And then inverse the fog value using those 255s we loaded, and multiply by fog color.
-	PSUBUSW(invertReg, R(fogMultReg));
+	// Before inverting, let's add that 255 we loaded in as well, since we have it.
+	PADDW(argColorReg, R(invertReg));
+	// And then invert the fog value using those 255s, and multiply by fog color.
+	PSUBW(invertReg, R(fogMultReg));
 	PMULLW(fogColorReg, R(invertReg));
 	// At this point, argColorReg and fogColorReg are multiplied at 16-bit, so we need to sum.
-	PADDUSW(argColorReg, R(fogColorReg));
+	PADDW(argColorReg, R(fogColorReg));
 	regCache_.Release(fogColorReg, RegCache::VEC_TEMP1);
 	regCache_.Release(invertReg, RegCache::VEC_TEMP2);
 	regCache_.Release(fogMultReg, RegCache::VEC_TEMP3);
 
-	// Now to divide by 255, we use bit tricks: multiply by 0x8081, and shift right by 16+7.
-	PMULHUW(argColorReg, M(constBy255i_));
-	// Now shift right by 7 (PMULHUW already did 16 of the shift.)
-	PSRLW(argColorReg, 7);
+	// Now we simply divide by 256, or in other words shift by 8.
+	PSRLW(argColorReg, 8);
 
 	// Okay, put A back in, we'll shrink it to 8888 when needed.
 	PINSRW(argColorReg, R(alphaReg), 3);

diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
@@ -348,7 +348,7 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc
 
 		return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128()));
 #else
-		Vec3<int> half = Vec3<int>::AssignToAll(1);
+		static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
 		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
 		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
 		return lhs + rhs;
@@ -370,7 +370,7 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc
 
 		return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128()));
 #else
-		Vec3<int> half = Vec3<int>::AssignToAll(1);
+		static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
 		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
 		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
 		return lhs - rhs;
@@ -392,7 +392,7 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc
 
 		return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128()));
 #else
-		Vec3<int> half = Vec3<int>::AssignToAll(1);
+		static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
 		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
 		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
 		return rhs - lhs;

diff --git a/GPU/Software/RasterizerRectangle.cpp b/GPU/Software/RasterizerRectangle.cpp
@@ -267,8 +267,6 @@ static inline bool NoClampOrWrap(const RasterizerState &state, const Vec2f &tc)
 		return false;
 	if (state.samplerID.cached.sizes[0].w > 512 || state.samplerID.cached.sizes[0].h > 512)
 		return false;
-	if (!state.throughMode)
-		return tc.x <= 1.0f && tc.y <= 1.0f;
 	return tc.x <= state.samplerID.cached.sizes[0].w && tc.y <= state.samplerID.cached.sizes[0].h;
 }
 
@@ -288,7 +286,7 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &b
 	// Currently only works for TL/BR, which is the most common but not required.
 	bool orient_check = xdiff >= 0 && ydiff >= 0;
 	// We already have a fast path for clear in ClearRectangle.
-	bool state_check = !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(state, v0.texturecoords) && NoClampOrWrap(state, v1.texturecoords);
+	bool state_check = state.throughMode && !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(state, v0.texturecoords) && NoClampOrWrap(state, v1.texturecoords);
 	// This doesn't work well with offset drawing, see #15876.  Through never has a subpixel offset.
 	bool subpixel_check = ((v0.screenpos.x | v0.screenpos.y | v1.screenpos.x | v1.screenpos.y) & 0xF) == 0;
 	if ((coord_check || !state.enableTextures) && orient_check && state_check && subpixel_check) {