diff --git a/GPU/Math3D.h b/GPU/Math3D.h index dd1991f91772..a87cce83184b 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -228,11 +228,11 @@ class Vec3 Vec3() {} Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {} - Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {} + constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {} Vec3(const Vec2& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {} #if defined(_M_SSE) - Vec3(const __m128 &_vec) : vec(_vec) {} - Vec3(const __m128i &_ivec) : ivec(_ivec) {} + constexpr Vec3(const __m128 &_vec) : vec(_vec) {} + constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {} Vec3(const Vec3Packed &_xyz) { vec = _mm_loadu_ps(_xyz.AsArray()); } @@ -249,7 +249,7 @@ class Vec3 #endif template - Vec3 Cast() const + constexpr Vec3 Cast() const { return Vec3((T2)x, (T2)y, (T2)z); } @@ -258,7 +258,7 @@ class Vec3 static Vec3 FromRGB(unsigned int rgb); unsigned int ToRGB() const; // alpha bits set to zero - static Vec3 AssignToAll(const T& f) + static constexpr Vec3 AssignToAll(const T& f) { return Vec3(f, f, f); } diff --git a/GPU/Software/DrawPixel.cpp b/GPU/Software/DrawPixel.cpp index ab244aacf812..ee47c27b9430 100644 --- a/GPU/Software/DrawPixel.cpp +++ b/GPU/Software/DrawPixel.cpp @@ -415,7 +415,9 @@ void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg colo // Fog is applied prior to color test. if (pixelID.applyFog && !clearMode) { Vec3 fogColor = Vec3::FromRGB(pixelID.cached.fogColor); - fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog)) / 255; + // This is very similar to the BLEND texfunc, and simply always rounds up. + static constexpr Vec3 roundup = Vec3::AssignToAll(255); + fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog) + roundup) / 256; prim_color.r() = fogColor.r(); prim_color.g() = fogColor.g(); prim_color.b() = fogColor.b(); @@ -548,8 +550,6 @@ void PixelJitCache::Clear() { constBlendHalf_11_4s_ = nullptr; constBlendInvert_11_4s_ = nullptr; - const255_16s_ = nullptr; - constBy255i_ = nullptr; } std::string PixelJitCache::DescribeCodePtr(const u8 *ptr) { diff --git a/GPU/Software/DrawPixel.h b/GPU/Software/DrawPixel.h index 982ee2776531..f813bf61c1f0 100644 --- a/GPU/Software/DrawPixel.h +++ b/GPU/Software/DrawPixel.h @@ -106,8 +106,6 @@ class PixelJitCache : public Rasterizer::CodeBlock { const u8 *constBlendHalf_11_4s_ = nullptr; const u8 *constBlendInvert_11_4s_ = nullptr; - const u8 *const255_16s_ = nullptr; - const u8 *constBy255i_ = nullptr; #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) void Discard(); diff --git a/GPU/Software/DrawPixelX86.cpp b/GPU/Software/DrawPixelX86.cpp index 8f006bece4fd..dc53be1c6219 100644 --- a/GPU/Software/DrawPixelX86.cpp +++ b/GPU/Software/DrawPixelX86.cpp @@ -336,12 +336,6 @@ void PixelJitCache::WriteConstantPool(const PixelFuncID &id) { // This is used for shifted blend factors, to inverse them. WriteSimpleConst8x16(constBlendInvert_11_4s_, 0xFF << 4); - - // A set of 255s, used to inverse fog. - WriteSimpleConst8x16(const255_16s_, 0xFF); - - // This is used for a multiply that divides by 255 with shifting. - WriteSimpleConst8x16(constBy255i_, 0x8081); } bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) { @@ -535,7 +529,8 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) { // Load a set of 255s at 16 bit into a reg for later... X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2); - MOVDQA(invertReg, M(const255_16s_)); + PCMPEQW(invertReg, R(invertReg)); + PSRLW(invertReg, 8); // Expand (we clamped) color to 16 bit as well, so we can multiply with fog. X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR); @@ -568,21 +563,24 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) { // We can free up the actual fog reg now. regCache_.ForceRelease(RegCache::GEN_ARG_FOG); + // Our goal here is to calculate this formula: + // (argColor * fog + fogColor * (255 - fog) + 255) / 256 + // Now we multiply the existing color by fog... PMULLW(argColorReg, R(fogMultReg)); - // And then inverse the fog value using those 255s we loaded, and multiply by fog color. - PSUBUSW(invertReg, R(fogMultReg)); + // Before inversing, let's add that 255 we loaded in as well, since we have it. + PADDW(argColorReg, R(invertReg)); + // And then inverse the fog value using those 255s, and multiply by fog color. + PSUBW(invertReg, R(fogMultReg)); PMULLW(fogColorReg, R(invertReg)); // At this point, argColorReg and fogColorReg are multiplied at 16-bit, so we need to sum. - PADDUSW(argColorReg, R(fogColorReg)); + PADDW(argColorReg, R(fogColorReg)); regCache_.Release(fogColorReg, RegCache::VEC_TEMP1); regCache_.Release(invertReg, RegCache::VEC_TEMP2); regCache_.Release(fogMultReg, RegCache::VEC_TEMP3); - // Now to divide by 255, we use bit tricks: multiply by 0x8081, and shift right by 16+7. - PMULHUW(argColorReg, M(constBy255i_)); - // Now shift right by 7 (PMULHUW already did 16 of the shift.) - PSRLW(argColorReg, 7); + // Now we simply divide by 256, or in other words shift by 8. + PSRLW(argColorReg, 8); // Okay, put A back in, we'll shrink it to 8888 when needed. PINSRW(argColorReg, R(alphaReg), 3); diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index ed8848ce8068..0348f6d57af8 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -348,7 +348,7 @@ Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 &sourc return Vec3(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128())); #else - Vec3 half = Vec3::AssignToAll(1); + static constexpr Vec3 half = Vec3::AssignToAll(1); Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; Vec3 rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024; return lhs + rhs; @@ -370,7 +370,7 @@ Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 &sourc return Vec3(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128())); #else - Vec3 half = Vec3::AssignToAll(1); + static constexpr Vec3 half = Vec3::AssignToAll(1); Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; Vec3 rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024; return lhs - rhs; @@ -392,7 +392,7 @@ Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 &sourc return Vec3(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128())); #else - Vec3 half = Vec3::AssignToAll(1); + static constexpr Vec3 half = Vec3::AssignToAll(1); Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; Vec3 rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024; return rhs - lhs; diff --git a/GPU/Software/RasterizerRectangle.cpp b/GPU/Software/RasterizerRectangle.cpp index 424c4549bd4e..c94e23beb62a 100644 --- a/GPU/Software/RasterizerRectangle.cpp +++ b/GPU/Software/RasterizerRectangle.cpp @@ -267,8 +267,6 @@ static inline bool NoClampOrWrap(const RasterizerState &state, const Vec2f &tc) return false; if (state.samplerID.cached.sizes[0].w > 512 || state.samplerID.cached.sizes[0].h > 512) return false; - if (!state.throughMode) - return tc.x <= 1.0f && tc.y <= 1.0f; return tc.x <= state.samplerID.cached.sizes[0].w && tc.y <= state.samplerID.cached.sizes[0].h; } @@ -288,7 +286,7 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &b // Currently only works for TL/BR, which is the most common but not required. bool orient_check = xdiff >= 0 && ydiff >= 0; // We already have a fast path for clear in ClearRectangle. - bool state_check = !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(state, v0.texturecoords) && NoClampOrWrap(state, v1.texturecoords); + bool state_check = state.throughMode && !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(state, v0.texturecoords) && NoClampOrWrap(state, v1.texturecoords); // This doesn't work well with offset drawing, see #15876. Through never has a subpixel offset. bool subpixel_check = ((v0.screenpos.x | v0.screenpos.y | v1.screenpos.x | v1.screenpos.y) & 0xF) == 0; if ((coord_check || !state.enableTextures) && orient_check && state_check && subpixel_check) { diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index 2a8eeeffa6ab..28a91a7a8cad 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -482,16 +482,11 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G VertexReader vreader(decoded_, vtxfmt, vertex_type); - static VertexData data[4]; // Normally max verts per prim is 3, but we temporarily need 4 to detect rectangles from strips. - // This is the index of the next vert in data (or higher, may need modulus.) - static int data_index = 0; - - static GEPrimitiveType prev_prim = GE_PRIM_POINTS; if (prim_type != GE_PRIM_KEEP_PREVIOUS) { - data_index = 0; - prev_prim = prim_type; + data_index_ = 0; + prev_prim_ = prim_type; } else { - prim_type = prev_prim; + prim_type = prev_prim_; } int vtcs_per_prim; @@ -530,14 +525,14 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G vreader.Goto(vtx); } - data[data_index++] = ReadVertex(vreader, transformState, outside_range_flag); - if (data_index < vtcs_per_prim) { + data_[data_index_++] = ReadVertex(vreader, transformState, outside_range_flag); + if (data_index_ < vtcs_per_prim) { // Keep reading. Note: an incomplete prim will stay read for GE_PRIM_KEEP_PREVIOUS. continue; } // Okay, we've got enough verts. Reset the index for next time. - data_index = 0; + data_index_ = 0; if (outside_range_flag) { // Cull the prim if it was outside, and move to the next prim. outside_range_flag = false; @@ -548,22 +543,22 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G case GE_PRIM_TRIANGLES: { if (cullType == CullType::OFF) { - Clipper::ProcessTriangle(data[0], data[1], data[2], data[2], *binner_); - Clipper::ProcessTriangle(data[2], data[1], data[0], data[2], *binner_); + Clipper::ProcessTriangle(data_[0], data_[1], data_[2], data_[2], *binner_); + Clipper::ProcessTriangle(data_[2], data_[1], data_[0], data_[2], *binner_); } else if (cullType == CullType::CW) { - Clipper::ProcessTriangle(data[2], data[1], data[0], data[2], *binner_); + Clipper::ProcessTriangle(data_[2], data_[1], data_[0], data_[2], *binner_); } else { - Clipper::ProcessTriangle(data[0], data[1], data[2], data[2], *binner_); + Clipper::ProcessTriangle(data_[0], data_[1], data_[2], data_[2], *binner_); } break; } case GE_PRIM_LINES: - Clipper::ProcessLine(data[0], data[1], *binner_); + Clipper::ProcessLine(data_[0], data_[1], *binner_); break; case GE_PRIM_POINTS: - Clipper::ProcessPoint(data[0], *binner_); + Clipper::ProcessPoint(data_[0], *binner_); break; default: @@ -581,45 +576,45 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G vreader.Goto(vtx); } - data[data_index++] = ReadVertex(vreader, transformState, outside_range_flag); + data_[data_index_++] = ReadVertex(vreader, transformState, outside_range_flag); if (outside_range_flag) { outside_range_flag = false; // Note: this is the post increment index. If odd, we set the first vert. - if (data_index & 1) { + if (data_index_ & 1) { // Skip the next one and forget this one. vtx++; - data_index--; + data_index_--; } else { // Forget both of the last 2. - data_index -= 2; + data_index_ -= 2; } } - if (data_index == 4 && vreader.isThrough() && cullType == CullType::OFF) { - if (Rasterizer::DetectRectangleThroughModeSlices(binner_->State(), data)) { - data[1] = data[3]; - data_index = 2; + if (data_index_ == 4 && vreader.isThrough() && cullType == CullType::OFF) { + if (Rasterizer::DetectRectangleThroughModeSlices(binner_->State(), data_)) { + data_[1] = data_[3]; + data_index_ = 2; } } - if (data_index == 4) { - Clipper::ProcessRect(data[0], data[1], *binner_); - Clipper::ProcessRect(data[2], data[3], *binner_); - data_index = 0; + if (data_index_ == 4) { + Clipper::ProcessRect(data_[0], data_[1], *binner_); + Clipper::ProcessRect(data_[2], data_[3], *binner_); + data_index_ = 0; } } - if (data_index >= 2) { - Clipper::ProcessRect(data[0], data[1], *binner_); - data_index -= 2; + if (data_index_ >= 2) { + Clipper::ProcessRect(data_[0], data_[1], *binner_); + data_index_ -= 2; } break; case GE_PRIM_LINE_STRIP: { // Don't draw a line when loading the first vertex. - // If data_index is 1 or 2, etc., it means we're continuing a line strip. - int skip_count = data_index == 0 ? 1 : 0; + // If data_index_ is 1 or 2, etc., it means we're continuing a line strip. + int skip_count = data_index_ == 0 ? 1 : 0; for (int vtx = 0; vtx < vertex_count; ++vtx) { if (indices) { vreader.Goto(ConvertIndex(vtx) - index_lower_bound); @@ -627,7 +622,7 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G vreader.Goto(vtx); } - data[(data_index++) & 1] = ReadVertex(vreader, transformState, outside_range_flag); + data_[(data_index_++) & 1] = ReadVertex(vreader, transformState, outside_range_flag); if (outside_range_flag) { // Drop all primitives containing the current vertex skip_count = 2; @@ -638,8 +633,8 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G if (skip_count) { --skip_count; } else { - // We already incremented data_index, so data_index & 1 is previous one. - Clipper::ProcessLine(data[data_index & 1], data[(data_index & 1) ^ 1], *binner_); + // We already incremented data_index_, so data_index_ & 1 is previous one. + Clipper::ProcessLine(data_[data_index_ & 1], data_[(data_index_ & 1) ^ 1], *binner_); } } break; @@ -648,11 +643,11 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G case GE_PRIM_TRIANGLE_STRIP: { // Don't draw a triangle when loading the first two vertices. - int skip_count = data_index >= 2 ? 0 : 2 - data_index; + int skip_count = data_index_ >= 2 ? 0 : 2 - data_index_; // If index count == 4, check if we can convert to a rectangle. // This is for Darkstalkers (and should speed up many 2D games). - if (data_index == 0 && vertex_count == 4 && cullType == CullType::OFF) { + if (data_index_ == 0 && vertex_count == 4 && cullType == CullType::OFF) { for (int vtx = 0; vtx < 4; ++vtx) { if (indices) { vreader.Goto(ConvertIndex(vtx) - index_lower_bound); @@ -660,13 +655,13 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G else { vreader.Goto(vtx); } - data[vtx] = ReadVertex(vreader, transformState, outside_range_flag); + data_[vtx] = ReadVertex(vreader, transformState, outside_range_flag); } // If a strip is effectively a rectangle, draw it as such! int tl = -1, br = -1; - if (!outside_range_flag && Rasterizer::DetectRectangleFromStrip(binner_->State(), data, &tl, &br)) { - Clipper::ProcessRect(data[tl], data[br], *binner_); + if (!outside_range_flag && Rasterizer::DetectRectangleFromStrip(binner_->State(), data_, &tl, &br)) { + Clipper::ProcessRect(data_[tl], data_[br], *binner_); break; } } @@ -679,8 +674,8 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G vreader.Goto(vtx); } - int provoking_index = (data_index++) % 3; - data[provoking_index] = ReadVertex(vreader, transformState, outside_range_flag); + int provoking_index = (data_index_++) % 3; + data_[provoking_index] = ReadVertex(vreader, transformState, outside_range_flag); if (outside_range_flag) { // Drop all primitives containing the current vertex skip_count = 2; @@ -694,14 +689,14 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G } if (cullType == CullType::OFF) { - Clipper::ProcessTriangle(data[0], data[1], data[2], data[provoking_index], *binner_); - Clipper::ProcessTriangle(data[2], data[1], data[0], data[provoking_index], *binner_); - } else if ((!(int)cullType) ^ ((data_index - 1) % 2)) { + Clipper::ProcessTriangle(data_[0], data_[1], data_[2], data_[provoking_index], *binner_); + Clipper::ProcessTriangle(data_[2], data_[1], data_[0], data_[provoking_index], *binner_); + } else if ((!(int)cullType) ^ ((data_index_ - 1) % 2)) { // We need to reverse the vertex order for each second primitive, // but we additionally need to do that for every primitive if CCW cullmode is used. - Clipper::ProcessTriangle(data[2], data[1], data[0], data[provoking_index], *binner_); + Clipper::ProcessTriangle(data_[2], data_[1], data_[0], data_[provoking_index], *binner_); } else { - Clipper::ProcessTriangle(data[0], data[1], data[2], data[provoking_index], *binner_); + Clipper::ProcessTriangle(data_[0], data_[1], data_[2], data_[provoking_index], *binner_); } } break; @@ -711,18 +706,18 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G { // Don't draw a triangle when loading the first two vertices. // (this doesn't count the central one.) - int skip_count = data_index <= 1 ? 1 : 0; + int skip_count = data_index_ <= 1 ? 1 : 0; int start_vtx = 0; // Only read the central vertex if we're not continuing. - if (data_index == 0) { + if (data_index_ == 0) { if (indices) { vreader.Goto(ConvertIndex(0) - index_lower_bound); } else { vreader.Goto(0); } - data[0] = ReadVertex(vreader, transformState, outside_range_flag); - data_index++; + data_[0] = ReadVertex(vreader, transformState, outside_range_flag); + data_index_++; start_vtx = 1; // If the central vertex is outside range, all the points are toast. @@ -730,19 +725,19 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G break; } - if (data_index == 1 && vertex_count == 4 && cullType == CullType::OFF) { + if (data_index_ == 1 && vertex_count == 4 && cullType == CullType::OFF) { for (int vtx = start_vtx; vtx < vertex_count; ++vtx) { if (indices) { vreader.Goto(ConvertIndex(vtx) - index_lower_bound); } else { vreader.Goto(vtx); } - data[vtx] = ReadVertex(vreader, transformState, outside_range_flag); + data_[vtx] = ReadVertex(vreader, transformState, outside_range_flag); } int tl = -1, br = -1; - if (!outside_range_flag && Rasterizer::DetectRectangleFromFan(binner_->State(), data, vertex_count, &tl, &br)) { - Clipper::ProcessRect(data[tl], data[br], *binner_); + if (!outside_range_flag && Rasterizer::DetectRectangleFromFan(binner_->State(), data_, vertex_count, &tl, &br)) { + Clipper::ProcessRect(data_[tl], data_[br], *binner_); break; } } @@ -755,8 +750,8 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G vreader.Goto(vtx); } - int provoking_index = 2 - ((data_index++) % 2); - data[provoking_index] = ReadVertex(vreader, transformState, outside_range_flag); + int provoking_index = 2 - ((data_index_++) % 2); + data_[provoking_index] = ReadVertex(vreader, transformState, outside_range_flag); if (outside_range_flag) { // Drop all primitives containing the current vertex skip_count = 2; @@ -770,14 +765,14 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G } if (cullType == CullType::OFF) { - Clipper::ProcessTriangle(data[0], data[1], data[2], data[provoking_index], *binner_); - Clipper::ProcessTriangle(data[2], data[1], data[0], data[provoking_index], *binner_); - } else if ((!(int)cullType) ^ ((data_index - 1) % 2)) { + Clipper::ProcessTriangle(data_[0], data_[1], data_[2], data_[provoking_index], *binner_); + Clipper::ProcessTriangle(data_[2], data_[1], data_[0], data_[provoking_index], *binner_); + } else if ((!(int)cullType) ^ ((data_index_ - 1) % 2)) { // We need to reverse the vertex order for each second primitive, // but we additionally need to do that for every primitive if CCW cullmode is used. - Clipper::ProcessTriangle(data[2], data[1], data[0], data[provoking_index], *binner_); + Clipper::ProcessTriangle(data_[2], data_[1], data_[0], data_[provoking_index], *binner_); } else { - Clipper::ProcessTriangle(data[0], data[1], data[2], data[provoking_index], *binner_); + Clipper::ProcessTriangle(data_[0], data_[1], data_[2], data_[provoking_index], *binner_); } } break; diff --git a/GPU/Software/TransformUnit.h b/GPU/Software/TransformUnit.h index 54f9fb5f5ec7..faa761639c57 100644 --- a/GPU/Software/TransformUnit.h +++ b/GPU/Software/TransformUnit.h @@ -136,6 +136,12 @@ class TransformUnit { u8 *decoded_ = nullptr; BinManager *binner_ = nullptr; + + // Normally max verts per prim is 3, but we temporarily need 4 to detect rectangles from strips. + VertexData data_[4]; + // This is the index of the next vert in data (or higher, may need modulus.) + int data_index_ = 0; + GEPrimitiveType prev_prim_ = GE_PRIM_POINTS; }; class SoftwareDrawEngine : public DrawEngineCommon {