mirror of
				https://git.zaroz.cloud/nintendo-back-up/yuzu/yuzu.git
				synced 2025-05-12 00:45:25 +00:00 
			
		
		
		
	Merge pull request #562 from neobrain/pica_progress3
More PICA200 Emulation Fixes
This commit is contained in:
		
						commit
						4a48b017ca
					
				| @ -372,15 +372,15 @@ static void ExecuteCommand(const Command& command, u32 thread_id) { | |||||||
|                          Memory::VirtualToPhysicalAddress(params.start1) >> 3); |                          Memory::VirtualToPhysicalAddress(params.start1) >> 3); | ||||||
|         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)), |         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)), | ||||||
|                          Memory::VirtualToPhysicalAddress(params.end1) >> 3); |                          Memory::VirtualToPhysicalAddress(params.end1) >> 3); | ||||||
|         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].size)), params.end1 - params.start1); |         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value_32bit)), params.value1); | ||||||
|         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value)), params.value1); |         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].control)), params.control1); | ||||||
| 
 | 
 | ||||||
|         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)), |         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)), | ||||||
|                          Memory::VirtualToPhysicalAddress(params.start2) >> 3); |                          Memory::VirtualToPhysicalAddress(params.start2) >> 3); | ||||||
|         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)), |         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)), | ||||||
|                          Memory::VirtualToPhysicalAddress(params.end2) >> 3); |                          Memory::VirtualToPhysicalAddress(params.end2) >> 3); | ||||||
|         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].size)), params.end2 - params.start2); |         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value_32bit)), params.value2); | ||||||
|         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value)), params.value2); |         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].control)), params.control2); | ||||||
|         break; |         break; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -109,9 +109,13 @@ struct Command { | |||||||
|             u32 start1; |             u32 start1; | ||||||
|             u32 value1; |             u32 value1; | ||||||
|             u32 end1; |             u32 end1; | ||||||
|  | 
 | ||||||
|             u32 start2; |             u32 start2; | ||||||
|             u32 value2; |             u32 value2; | ||||||
|             u32 end2; |             u32 end2; | ||||||
|  | 
 | ||||||
|  |             u16 control1; | ||||||
|  |             u16 control2; | ||||||
|         } memory_fill; |         } memory_fill; | ||||||
| 
 | 
 | ||||||
|         struct { |         struct { | ||||||
|  | |||||||
| @ -67,23 +67,38 @@ inline void Write(u32 addr, const T data) { | |||||||
|     switch (index) { |     switch (index) { | ||||||
| 
 | 
 | ||||||
|     // Memory fills are triggered once the fill value is written.
 |     // Memory fills are triggered once the fill value is written.
 | ||||||
|     // NOTE: This is not verified.
 |     case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].trigger, 0x00004 + 0x3): | ||||||
|     case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].value, 0x00004 + 0x3): |     case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].trigger, 0x00008 + 0x3): | ||||||
|     case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].value, 0x00008 + 0x3): |  | ||||||
|     { |     { | ||||||
|         const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].value)); |         const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger)); | ||||||
|         const auto& config = g_regs.memory_fill_config[is_second_filler]; |         auto& config = g_regs.memory_fill_config[is_second_filler]; | ||||||
| 
 | 
 | ||||||
|         // TODO: Not sure if this check should be done at GSP level instead
 |         if (config.address_start && config.trigger) { | ||||||
|         if (config.address_start) { |             u8* start = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress())); | ||||||
|             // TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all
 |             u8* end = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress())); | ||||||
|             u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress())); | 
 | ||||||
|             u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress())); |             if (config.fill_24bit) { | ||||||
|             for (u32* ptr = start; ptr < end; ++ptr) |                 // fill with 24-bit values
 | ||||||
|                 *ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation
 |                 for (u8* ptr = start; ptr < end; ptr += 3) { | ||||||
|  |                     ptr[0] = config.value_24bit_b; | ||||||
|  |                     ptr[1] = config.value_24bit_g; | ||||||
|  |                     ptr[2] = config.value_24bit_r; | ||||||
|  |                 } | ||||||
|  |             } else if (config.fill_32bit) { | ||||||
|  |                 // fill with 32-bit values
 | ||||||
|  |                 for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr) | ||||||
|  |                     *ptr = config.value_32bit; | ||||||
|  |             } else { | ||||||
|  |                 // fill with 16-bit values
 | ||||||
|  |                 for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr) | ||||||
|  |                     *ptr = config.value_16bit; | ||||||
|  |             } | ||||||
| 
 | 
 | ||||||
|             LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress()); |             LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress()); | ||||||
| 
 | 
 | ||||||
|  |             config.trigger = 0; | ||||||
|  |             config.finished = 1; | ||||||
|  | 
 | ||||||
|             if (!is_second_filler) { |             if (!is_second_filler) { | ||||||
|                 GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0); |                 GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0); | ||||||
|             } else { |             } else { | ||||||
|  | |||||||
| @ -84,9 +84,35 @@ struct Regs { | |||||||
| 
 | 
 | ||||||
|     struct { |     struct { | ||||||
|         u32 address_start; |         u32 address_start; | ||||||
|         u32 address_end; // ?
 |         u32 address_end; | ||||||
|         u32 size; | 
 | ||||||
|         u32 value; // ?
 |         union { | ||||||
|  |             u32 value_32bit; | ||||||
|  | 
 | ||||||
|  |             BitField<0, 16, u32> value_16bit; | ||||||
|  | 
 | ||||||
|  |             // TODO: Verify component order
 | ||||||
|  |             BitField< 0, 8, u32> value_24bit_r; | ||||||
|  |             BitField< 8, 8, u32> value_24bit_g; | ||||||
|  |             BitField<16, 8, u32> value_24bit_b; | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         union { | ||||||
|  |             u32 control; | ||||||
|  | 
 | ||||||
|  |             // Setting this field to 1 triggers the memory fill.
 | ||||||
|  |             // This field also acts as a status flag, and gets reset to 0 upon completion.
 | ||||||
|  |             BitField<0, 1, u32> trigger; | ||||||
|  | 
 | ||||||
|  |             // Set to 1 upon completion.
 | ||||||
|  |             BitField<0, 1, u32> finished; | ||||||
|  | 
 | ||||||
|  |             // 0: fill with 16- or 32-bit wide values; 1: fill with 24-bit wide values
 | ||||||
|  |             BitField<8, 1, u32> fill_24bit; | ||||||
|  | 
 | ||||||
|  |             // 0: fill with 16-bit wide values; 1: fill with 32-bit wide values
 | ||||||
|  |             BitField<9, 1, u32> fill_32bit; | ||||||
|  |         }; | ||||||
| 
 | 
 | ||||||
|         inline u32 GetStartAddress() const { |         inline u32 GetStartAddress() const { | ||||||
|             return DecodeAddressRegister(address_start); |             return DecodeAddressRegister(address_start); | ||||||
|  | |||||||
| @ -15,30 +15,18 @@ namespace Clipper { | |||||||
| 
 | 
 | ||||||
| struct ClippingEdge { | struct ClippingEdge { | ||||||
| public: | public: | ||||||
|     enum Type { |     ClippingEdge(Math::Vec4<float24> coeffs, | ||||||
|         POS_X = 0, |                  Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0), | ||||||
|         NEG_X = 1, |                                                                 float24::FromFloat32(0), | ||||||
|         POS_Y = 2, |                                                                 float24::FromFloat32(0), | ||||||
|         NEG_Y = 3, |                                                                 float24::FromFloat32(0))) | ||||||
|         POS_Z = 4, |         : coeffs(coeffs), | ||||||
|         NEG_Z = 5, |           bias(bias) | ||||||
|     }; |     { | ||||||
| 
 |     } | ||||||
|     ClippingEdge(Type type, float24 position) : type(type), pos(position) {} |  | ||||||
| 
 | 
 | ||||||
|     bool IsInside(const OutputVertex& vertex) const { |     bool IsInside(const OutputVertex& vertex) const { | ||||||
|         switch (type) { |         return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0); | ||||||
|         case POS_X: return vertex.pos.x <= pos * vertex.pos.w; |  | ||||||
|         case NEG_X: return vertex.pos.x >= pos * vertex.pos.w; |  | ||||||
|         case POS_Y: return vertex.pos.y <= pos * vertex.pos.w; |  | ||||||
|         case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w; |  | ||||||
| 
 |  | ||||||
|         // TODO: Check z compares ... should be 0..1 instead?
 |  | ||||||
|         case POS_Z: return vertex.pos.z <= pos * vertex.pos.w; |  | ||||||
| 
 |  | ||||||
|         default: |  | ||||||
|         case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w; |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     bool IsOutSide(const OutputVertex& vertex) const { |     bool IsOutSide(const OutputVertex& vertex) const { | ||||||
| @ -46,31 +34,17 @@ public: | |||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const { |     OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const { | ||||||
|         auto dotpr = [this](const OutputVertex& vtx) { |         float24 dp = Math::Dot(v0.pos + bias, coeffs); | ||||||
|             switch (type) { |         float24 dp_prev = Math::Dot(v1.pos + bias, coeffs); | ||||||
|             case POS_X: return vtx.pos.x - vtx.pos.w; |  | ||||||
|             case NEG_X: return -vtx.pos.x - vtx.pos.w; |  | ||||||
|             case POS_Y: return vtx.pos.y - vtx.pos.w; |  | ||||||
|             case NEG_Y: return -vtx.pos.y - vtx.pos.w; |  | ||||||
| 
 |  | ||||||
|             // TODO: Verify z clipping
 |  | ||||||
|             case POS_Z: return vtx.pos.z - vtx.pos.w; |  | ||||||
| 
 |  | ||||||
|             default: |  | ||||||
|             case NEG_Z: return -vtx.pos.w; |  | ||||||
|             } |  | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|         float24 dp = dotpr(v0); |  | ||||||
|         float24 dp_prev = dotpr(v1); |  | ||||||
|         float24 factor = dp_prev / (dp_prev - dp); |         float24 factor = dp_prev / (dp_prev - dp); | ||||||
| 
 | 
 | ||||||
|         return OutputVertex::Lerp(factor, v0, v1); |         return OutputVertex::Lerp(factor, v0, v1); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| private: | private: | ||||||
|     Type type; |  | ||||||
|     float24 pos; |     float24 pos; | ||||||
|  |     Math::Vec4<float24> coeffs; | ||||||
|  |     Math::Vec4<float24> bias; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| static void InitScreenCoordinates(OutputVertex& vtx) | static void InitScreenCoordinates(OutputVertex& vtx) | ||||||
| @ -98,10 +72,9 @@ static void InitScreenCoordinates(OutputVertex& vtx) | |||||||
|     vtx.tc2 *= inv_w; |     vtx.tc2 *= inv_w; | ||||||
|     vtx.pos.w = inv_w; |     vtx.pos.w = inv_w; | ||||||
| 
 | 
 | ||||||
|     // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
 |  | ||||||
|     vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; |     vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; | ||||||
|     vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; |     vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; | ||||||
|     vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale; |     vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { | void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { | ||||||
| @ -117,14 +90,29 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { | |||||||
|     auto* output_list = &buffer_a; |     auto* output_list = &buffer_a; | ||||||
|     auto* input_list  = &buffer_b; |     auto* input_list  = &buffer_b; | ||||||
| 
 | 
 | ||||||
|  |     // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
 | ||||||
|  |     // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
 | ||||||
|  |     //       epsilon possible within float24 accuracy.
 | ||||||
|  |     static const float24 EPSILON = float24::FromFloat32(0.00001); | ||||||
|  |     static const float24 f0 = float24::FromFloat32(0.0); | ||||||
|  |     static const float24 f1 = float24::FromFloat32(1.0); | ||||||
|  |     static const std::array<ClippingEdge, 7> clipping_edges = {{ | ||||||
|  |         { Math::MakeVec( f1,  f0,  f0, -f1) },  // x = +w
 | ||||||
|  |         { Math::MakeVec(-f1,  f0,  f0, -f1) },  // x = -w
 | ||||||
|  |         { Math::MakeVec( f0,  f1,  f0, -f1) },  // y = +w
 | ||||||
|  |         { Math::MakeVec( f0, -f1,  f0, -f1) },  // y = -w
 | ||||||
|  |         { Math::MakeVec( f0,  f0,  f1,  f0) },  // z =  0
 | ||||||
|  |         { Math::MakeVec( f0,  f0, -f1, -f1) },  // z = -w
 | ||||||
|  |         { Math::MakeVec( f0,  f0,  f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON) }, // w = EPSILON
 | ||||||
|  |     }}; | ||||||
|  | 
 | ||||||
|  |     // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
 | ||||||
|  |     //       drop the whole primitive instead of clipping the primitive properly. We should test if
 | ||||||
|  |     //       this happens on the 3DS, too.
 | ||||||
|  | 
 | ||||||
|     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
 |     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
 | ||||||
|     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
 |     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
 | ||||||
|     for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)), |     for (auto edge : clipping_edges) { | ||||||
|                        ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)), |  | ||||||
|                        ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)), |  | ||||||
|                        ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)), |  | ||||||
|                        ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)), |  | ||||||
|                        ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) { |  | ||||||
| 
 | 
 | ||||||
|         std::swap(input_list, output_list); |         std::swap(input_list, output_list); | ||||||
|         output_list->clear(); |         output_list->clear(); | ||||||
|  | |||||||
| @ -2,6 +2,8 @@ | |||||||
| // Licensed under GPLv2 or any later version
 | // Licensed under GPLv2 or any later version
 | ||||||
| // Refer to the license.txt file included.
 | // Refer to the license.txt file included.
 | ||||||
| 
 | 
 | ||||||
|  | #include <boost/range/algorithm/fill.hpp> | ||||||
|  | 
 | ||||||
| #include "clipper.h" | #include "clipper.h" | ||||||
| #include "command_processor.h" | #include "command_processor.h" | ||||||
| #include "math.h" | #include "math.h" | ||||||
| @ -23,10 +25,6 @@ static int float_regs_counter = 0; | |||||||
| 
 | 
 | ||||||
| static u32 uniform_write_buffer[4]; | static u32 uniform_write_buffer[4]; | ||||||
| 
 | 
 | ||||||
| // Used for VSLoadProgramData and VSLoadSwizzleData
 |  | ||||||
| static u32 vs_binary_write_offset = 0; |  | ||||||
| static u32 vs_swizzle_write_offset = 0; |  | ||||||
| 
 |  | ||||||
| static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
| 
 | 
 | ||||||
|     if (id >= registers.NumIds()) |     if (id >= registers.NumIds()) | ||||||
| @ -65,10 +63,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||||||
| 
 | 
 | ||||||
|             // Information about internal vertex attributes
 |             // Information about internal vertex attributes
 | ||||||
|             u32 vertex_attribute_sources[16]; |             u32 vertex_attribute_sources[16]; | ||||||
|             std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef); |             boost::fill(vertex_attribute_sources, 0xdeadbeef); | ||||||
|             u32 vertex_attribute_strides[16]; |             u32 vertex_attribute_strides[16]; | ||||||
|             u32 vertex_attribute_formats[16]; |             u32 vertex_attribute_formats[16]; | ||||||
|             u32 vertex_attribute_elements[16]; | 
 | ||||||
|  |             // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
 | ||||||
|  |             // This is one of the hacks required to deal with uninitialized vertex attributes.
 | ||||||
|  |             // TODO: Fix this properly.
 | ||||||
|  |             u32 vertex_attribute_elements[16] = {}; | ||||||
|             u32 vertex_attribute_element_size[16]; |             u32 vertex_attribute_element_size[16]; | ||||||
| 
 | 
 | ||||||
|             // Setup attribute data from loaders
 |             // Setup attribute data from loaders
 | ||||||
| @ -252,11 +254,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||||||
|             break; |             break; | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         // Seems to be used to reset the write pointer for VSLoadProgramData
 |  | ||||||
|         case PICA_REG_INDEX(vs_program.begin_load): |  | ||||||
|             vs_binary_write_offset = 0; |  | ||||||
|             break; |  | ||||||
| 
 |  | ||||||
|         // Load shader program code
 |         // Load shader program code
 | ||||||
|         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc): |         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc): | ||||||
|         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd): |         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd): | ||||||
| @ -267,16 +264,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||||||
|         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2): |         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2): | ||||||
|         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3): |         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3): | ||||||
|         { |         { | ||||||
|             VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value); |             VertexShader::SubmitShaderMemoryChange(registers.vs_program.offset, value); | ||||||
|             vs_binary_write_offset++; |             registers.vs_program.offset++; | ||||||
|             break; |             break; | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         // Seems to be used to reset the write pointer for VSLoadSwizzleData
 |  | ||||||
|         case PICA_REG_INDEX(vs_swizzle_patterns.begin_load): |  | ||||||
|             vs_swizzle_write_offset = 0; |  | ||||||
|             break; |  | ||||||
| 
 |  | ||||||
|         // Load swizzle pattern data
 |         // Load swizzle pattern data
 | ||||||
|         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6): |         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6): | ||||||
|         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7): |         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7): | ||||||
| @ -287,8 +279,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | |||||||
|         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc): |         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc): | ||||||
|         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd): |         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd): | ||||||
|         { |         { | ||||||
|             VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value); |             VertexShader::SubmitSwizzleDataChange(registers.vs_swizzle_patterns.offset, value); | ||||||
|             vs_swizzle_write_offset++; |             registers.vs_swizzle_patterns.offset++; | ||||||
|             break; |             break; | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -120,6 +120,7 @@ struct Regs { | |||||||
|         enum WrapMode : u32 { |         enum WrapMode : u32 { | ||||||
|             ClampToEdge    = 0, |             ClampToEdge    = 0, | ||||||
|             Repeat         = 2, |             Repeat         = 2, | ||||||
|  |             MirroredRepeat = 3, | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
|         INSERT_PADDING_WORDS(0x1); |         INSERT_PADDING_WORDS(0x1); | ||||||
| @ -131,7 +132,7 @@ struct Regs { | |||||||
| 
 | 
 | ||||||
|         union { |         union { | ||||||
|             BitField< 8, 2, WrapMode> wrap_s; |             BitField< 8, 2, WrapMode> wrap_s; | ||||||
|             BitField<11, 2, WrapMode> wrap_t; |             BitField<12, 2, WrapMode> wrap_t; | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
|         INSERT_PADDING_WORDS(0x1); |         INSERT_PADDING_WORDS(0x1); | ||||||
| @ -223,6 +224,8 @@ struct Regs { | |||||||
|     struct TevStageConfig { |     struct TevStageConfig { | ||||||
|         enum class Source : u32 { |         enum class Source : u32 { | ||||||
|             PrimaryColor           = 0x0, |             PrimaryColor           = 0x0, | ||||||
|  |             PrimaryFragmentColor   = 0x1, | ||||||
|  | 
 | ||||||
|             Texture0               = 0x3, |             Texture0               = 0x3, | ||||||
|             Texture1               = 0x4, |             Texture1               = 0x4, | ||||||
|             Texture2               = 0x5, |             Texture2               = 0x5, | ||||||
| @ -265,6 +268,9 @@ struct Regs { | |||||||
|             AddSigned       = 3, |             AddSigned       = 3, | ||||||
|             Lerp            = 4, |             Lerp            = 4, | ||||||
|             Subtract        = 5, |             Subtract        = 5, | ||||||
|  | 
 | ||||||
|  |             MultiplyThenAdd = 8, | ||||||
|  |             AddThenMultiply = 9, | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
|         union { |         union { | ||||||
| @ -337,7 +343,7 @@ struct Regs { | |||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
|         union { |         union { | ||||||
|             enum BlendEquation : u32 { |             enum class BlendEquation : u32 { | ||||||
|                 Add             = 0, |                 Add             = 0, | ||||||
|                 Subtract        = 1, |                 Subtract        = 1, | ||||||
|                 ReverseSubtract = 2, |                 ReverseSubtract = 2, | ||||||
| @ -421,7 +427,7 @@ struct Regs { | |||||||
|         INSERT_PADDING_WORDS(0x6); |         INSERT_PADDING_WORDS(0x6); | ||||||
| 
 | 
 | ||||||
|         u32 depth_format; |         u32 depth_format; | ||||||
|         u32 color_format; |         BitField<16, 3, u32> color_format; | ||||||
| 
 | 
 | ||||||
|         INSERT_PADDING_WORDS(0x4); |         INSERT_PADDING_WORDS(0x4); | ||||||
| 
 | 
 | ||||||
| @ -678,7 +684,9 @@ struct Regs { | |||||||
|     INSERT_PADDING_WORDS(0x2); |     INSERT_PADDING_WORDS(0x2); | ||||||
| 
 | 
 | ||||||
|     struct { |     struct { | ||||||
|         u32 begin_load; |         // Offset of the next instruction to write code to.
 | ||||||
|  |         // Incremented with each instruction write.
 | ||||||
|  |         u32 offset; | ||||||
| 
 | 
 | ||||||
|         // Writing to these registers sets the "current" word in the shader program.
 |         // Writing to these registers sets the "current" word in the shader program.
 | ||||||
|         // TODO: It's not clear how the hardware stores what the "current" word is.
 |         // TODO: It's not clear how the hardware stores what the "current" word is.
 | ||||||
| @ -690,7 +698,9 @@ struct Regs { | |||||||
|     // This register group is used to load an internal table of swizzling patterns,
 |     // This register group is used to load an internal table of swizzling patterns,
 | ||||||
|     // which are indexed by each shader instruction to specify vector component swizzling.
 |     // which are indexed by each shader instruction to specify vector component swizzling.
 | ||||||
|     struct { |     struct { | ||||||
|         u32 begin_load; |         // Offset of the next swizzle pattern to write.
 | ||||||
|  |         // Incremented with each swizzle pattern write.
 | ||||||
|  |         u32 offset; | ||||||
| 
 | 
 | ||||||
|         // Writing to these registers sets the "current" swizzle pattern in the table.
 |         // Writing to these registers sets the "current" swizzle pattern in the table.
 | ||||||
|         // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
 |         // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
 | ||||||
|  | |||||||
| @ -5,6 +5,7 @@ | |||||||
| #include <algorithm> | #include <algorithm> | ||||||
| 
 | 
 | ||||||
| #include "common/common_types.h" | #include "common/common_types.h" | ||||||
|  | #include "common/math_util.h" | ||||||
| 
 | 
 | ||||||
| #include "math.h" | #include "math.h" | ||||||
| #include "pica.h" | #include "pica.h" | ||||||
| @ -20,16 +21,31 @@ namespace Rasterizer { | |||||||
| static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) { | static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) { | ||||||
|     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress(); |     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress(); | ||||||
|     u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr))); |     u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr))); | ||||||
|     u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b(); |  | ||||||
| 
 | 
 | ||||||
|     // Assuming RGBA8 format until actual framebuffer format handling is implemented
 |     // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
 | ||||||
|  |     // NOTE: The framebuffer height register contains the actual FB height minus one.
 | ||||||
|  |     y = (registers.framebuffer.height - y); | ||||||
|  | 
 | ||||||
|  |     switch (registers.framebuffer.color_format) { | ||||||
|  |     case registers.framebuffer.RGBA8: | ||||||
|  |     { | ||||||
|  |         u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b(); | ||||||
|         *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value; |         *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value; | ||||||
|  |         break; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     default: | ||||||
|  |         LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format); | ||||||
|  |         UNIMPLEMENTED(); | ||||||
|  |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static const Math::Vec4<u8> GetPixel(int x, int y) { | static const Math::Vec4<u8> GetPixel(int x, int y) { | ||||||
|     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress(); |     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress(); | ||||||
|     u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr))); |     u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr))); | ||||||
| 
 | 
 | ||||||
|  |     y = (registers.framebuffer.height - y); | ||||||
|  | 
 | ||||||
|     u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth()); |     u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth()); | ||||||
|     Math::Vec4<u8> ret; |     Math::Vec4<u8> ret; | ||||||
|     ret.a() = value >> 24; |     ret.a() = value >> 24; | ||||||
| @ -43,6 +59,8 @@ static u32 GetDepth(int x, int y) { | |||||||
|     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress(); |     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress(); | ||||||
|     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr))); |     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr))); | ||||||
| 
 | 
 | ||||||
|  |     y = (registers.framebuffer.height - y); | ||||||
|  | 
 | ||||||
|     // Assuming 16-bit depth buffer format until actual format handling is implemented
 |     // Assuming 16-bit depth buffer format until actual format handling is implemented
 | ||||||
|     return *(depth_buffer + x + y * registers.framebuffer.GetWidth()); |     return *(depth_buffer + x + y * registers.framebuffer.GetWidth()); | ||||||
| } | } | ||||||
| @ -51,6 +69,8 @@ static void SetDepth(int x, int y, u16 value) { | |||||||
|     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress(); |     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress(); | ||||||
|     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr))); |     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr))); | ||||||
| 
 | 
 | ||||||
|  |     y = (registers.framebuffer.height - y); | ||||||
|  | 
 | ||||||
|     // Assuming 16-bit depth buffer format until actual format handling is implemented
 |     // Assuming 16-bit depth buffer format until actual format handling is implemented
 | ||||||
|     *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value; |     *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value; | ||||||
| } | } | ||||||
| @ -90,15 +110,22 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1, | |||||||
|     return Math::Cross(vec1, vec2).z; |     return Math::Cross(vec1, vec2).z; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| void ProcessTriangle(const VertexShader::OutputVertex& v0, | /**
 | ||||||
|  |  * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing | ||||||
|  |  * culling via recursion. | ||||||
|  |  */ | ||||||
|  | static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, | ||||||
|                                     const VertexShader::OutputVertex& v1, |                                     const VertexShader::OutputVertex& v1, | ||||||
|                      const VertexShader::OutputVertex& v2) |                                     const VertexShader::OutputVertex& v2, | ||||||
|  |                                     bool reversed = false) | ||||||
| { | { | ||||||
|     // vertex positions in rasterizer coordinates
 |     // vertex positions in rasterizer coordinates
 | ||||||
|     auto FloatToFix = [](float24 flt) { |     static auto FloatToFix = [](float24 flt) { | ||||||
|                           return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f)); |         // TODO: Rounding here is necessary to prevent garbage pixels at
 | ||||||
|  |         //       triangle borders. Is that the correct solution, though?
 | ||||||
|  |         return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f))); | ||||||
|     }; |     }; | ||||||
|     auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) { |     static auto ScreenToRasterizerCoordinates = [](const Math::Vec3<float24>& vec) { | ||||||
|         return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)}; |         return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)}; | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
| @ -106,14 +133,20 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
|                                    ScreenToRasterizerCoordinates(v1.screenpos), |                                    ScreenToRasterizerCoordinates(v1.screenpos), | ||||||
|                                    ScreenToRasterizerCoordinates(v2.screenpos) }; |                                    ScreenToRasterizerCoordinates(v2.screenpos) }; | ||||||
| 
 | 
 | ||||||
|     if (registers.cull_mode == Regs::CullMode::KeepClockWise) { |     if (registers.cull_mode == Regs::CullMode::KeepAll) { | ||||||
|  |         // Make sure we always end up with a triangle wound counter-clockwise
 | ||||||
|  |         if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { | ||||||
|  |             ProcessTriangleInternal(v0, v2, v1, true); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |     } else { | ||||||
|  |         if (!reversed && registers.cull_mode == Regs::CullMode::KeepClockWise) { | ||||||
|             // Reverse vertex order and use the CCW code path.
 |             // Reverse vertex order and use the CCW code path.
 | ||||||
|         std::swap(vtxpos[1], vtxpos[2]); |             ProcessTriangleInternal(v0, v2, v1, true); | ||||||
|  |             return; | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|     if (registers.cull_mode != Regs::CullMode::KeepAll) { |  | ||||||
|         // Cull away triangles which are wound clockwise.
 |         // Cull away triangles which are wound clockwise.
 | ||||||
|         // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
 |  | ||||||
|         if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) |         if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) | ||||||
|             return; |             return; | ||||||
|     } |     } | ||||||
| @ -155,9 +188,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
|     auto textures = registers.GetTextures(); |     auto textures = registers.GetTextures(); | ||||||
|     auto tev_stages = registers.GetTevStages(); |     auto tev_stages = registers.GetTevStages(); | ||||||
| 
 | 
 | ||||||
|  |     // Enter rasterization loop, starting at the center of the top-left bounding box corner.
 | ||||||
|     // TODO: Not sure if looping through x first might be faster
 |     // TODO: Not sure if looping through x first might be faster
 | ||||||
|     for (u16 y = min_y; y < max_y; y += 0x10) { |     for (u16 y = min_y + 8; y < max_y; y += 0x10) { | ||||||
|         for (u16 x = min_x; x < max_x; x += 0x10) { |         for (u16 x = min_x + 8; x < max_x; x += 0x10) { | ||||||
| 
 | 
 | ||||||
|             // Calculate the barycentric coordinates w0, w1 and w2
 |             // Calculate the barycentric coordinates w0, w1 and w2
 | ||||||
|             int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); |             int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); | ||||||
| @ -220,7 +254,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
| 
 | 
 | ||||||
|                 int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32(); |                 int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32(); | ||||||
|                 int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32(); |                 int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32(); | ||||||
|                 auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) { |                 static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) { | ||||||
|                     switch (mode) { |                     switch (mode) { | ||||||
|                         case Regs::TextureConfig::ClampToEdge: |                         case Regs::TextureConfig::ClampToEdge: | ||||||
|                             val = std::max(val, 0); |                             val = std::max(val, 0); | ||||||
| @ -228,7 +262,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
|                             return val; |                             return val; | ||||||
| 
 | 
 | ||||||
|                         case Regs::TextureConfig::Repeat: |                         case Regs::TextureConfig::Repeat: | ||||||
|                             return (int)(((unsigned)val) % size); |                             return (int)((unsigned)val % size); | ||||||
|  | 
 | ||||||
|  |                         case Regs::TextureConfig::MirroredRepeat: | ||||||
|  |                         { | ||||||
|  |                             int val = (int)((unsigned)val % (2 * size)); | ||||||
|  |                             if (val >= size) | ||||||
|  |                                 val = 2 * size - 1 - val; | ||||||
|  |                             return val; | ||||||
|  |                         } | ||||||
| 
 | 
 | ||||||
|                         default: |                         default: | ||||||
|                             LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode); |                             LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode); | ||||||
| @ -236,6 +278,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
|                             return 0; |                             return 0; | ||||||
|                     } |                     } | ||||||
|                 }; |                 }; | ||||||
|  | 
 | ||||||
|  |                 // Textures are laid out from bottom to top, hence we invert the t coordinate.
 | ||||||
|  |                 // NOTE: This may not be the right place for the inversion.
 | ||||||
|  |                 // TODO: Check if this applies to ETC textures, too.
 | ||||||
|                 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); |                 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); | ||||||
|                 t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); |                 t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); | ||||||
| 
 | 
 | ||||||
| @ -262,7 +308,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
| 
 | 
 | ||||||
|                 auto GetSource = [&](Source source) -> Math::Vec4<u8> { |                 auto GetSource = [&](Source source) -> Math::Vec4<u8> { | ||||||
|                     switch (source) { |                     switch (source) { | ||||||
|  |                     // TODO: What's the difference between these two?
 | ||||||
|                     case Source::PrimaryColor: |                     case Source::PrimaryColor: | ||||||
|  |                     case Source::PrimaryFragmentColor: | ||||||
|                         return primary_color; |                         return primary_color; | ||||||
| 
 | 
 | ||||||
|                     case Source::Texture0: |                     case Source::Texture0: | ||||||
| @ -378,6 +426,25 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
|                         return result.Cast<u8>(); |                         return result.Cast<u8>(); | ||||||
|                     } |                     } | ||||||
| 
 | 
 | ||||||
|  |                     case Operation::MultiplyThenAdd: | ||||||
|  |                     { | ||||||
|  |                         auto result = (input[0] * input[1] + 255 * input[2].Cast<int>()) / 255; | ||||||
|  |                         result.r() = std::min(255, result.r()); | ||||||
|  |                         result.g() = std::min(255, result.g()); | ||||||
|  |                         result.b() = std::min(255, result.b()); | ||||||
|  |                         return result.Cast<u8>(); | ||||||
|  |                     } | ||||||
|  | 
 | ||||||
|  |                     case Operation::AddThenMultiply: | ||||||
|  |                     { | ||||||
|  |                         auto result = input[0] + input[1]; | ||||||
|  |                         result.r() = std::min(255, result.r()); | ||||||
|  |                         result.g() = std::min(255, result.g()); | ||||||
|  |                         result.b() = std::min(255, result.b()); | ||||||
|  |                         result = (result * input[2].Cast<int>()) / 255; | ||||||
|  |                         return result.Cast<u8>(); | ||||||
|  |                     } | ||||||
|  | 
 | ||||||
|                     default: |                     default: | ||||||
|                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op); |                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op); | ||||||
|                         UNIMPLEMENTED(); |                         UNIMPLEMENTED(); | ||||||
| @ -402,6 +469,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
|                     case Operation::Subtract: |                     case Operation::Subtract: | ||||||
|                         return std::max(0, (int)input[0] - (int)input[1]); |                         return std::max(0, (int)input[0] - (int)input[1]); | ||||||
| 
 | 
 | ||||||
|  |                     case Operation::MultiplyThenAdd: | ||||||
|  |                         return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255); | ||||||
|  | 
 | ||||||
|  |                     case Operation::AddThenMultiply: | ||||||
|  |                         return (std::min(255, (input[0] + input[1])) * input[2]) / 255; | ||||||
|  | 
 | ||||||
|                     default: |                     default: | ||||||
|                         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op); |                         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op); | ||||||
|                         UNIMPLEMENTED(); |                         UNIMPLEMENTED(); | ||||||
| @ -475,7 +548,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
| 
 | 
 | ||||||
|             // TODO: Does depth indeed only get written if depth testing is enabled?
 |             // TODO: Does depth indeed only get written if depth testing is enabled?
 | ||||||
|             if (registers.output_merger.depth_test_enable) { |             if (registers.output_merger.depth_test_enable) { | ||||||
|                 u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 + |                 u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 + | ||||||
|                             v1.screenpos[2].ToFloat32() * w1 + |                             v1.screenpos[2].ToFloat32() * w1 + | ||||||
|                             v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum); |                             v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum); | ||||||
|                 u16 ref_z = GetDepth(x >> 4, y >> 4); |                 u16 ref_z = GetDepth(x >> 4, y >> 4); | ||||||
| @ -524,6 +597,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|             auto dest = GetPixel(x >> 4, y >> 4); |             auto dest = GetPixel(x >> 4, y >> 4); | ||||||
|  |             Math::Vec4<u8> blend_output = combiner_output; | ||||||
| 
 | 
 | ||||||
|             if (registers.output_merger.alphablend_enable) { |             if (registers.output_merger.alphablend_enable) { | ||||||
|                 auto params = registers.output_merger.alpha_blending; |                 auto params = registers.output_merger.alpha_blending; | ||||||
| @ -574,7 +648,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
| 
 | 
 | ||||||
|                     default: |                     default: | ||||||
|                         LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor); |                         LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor); | ||||||
|                         exit(0); |                         UNIMPLEMENTED(); | ||||||
|                         break; |                         break; | ||||||
|                     } |                     } | ||||||
|                 }; |                 }; | ||||||
| @ -607,86 +681,78 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
| 
 | 
 | ||||||
|                     default: |                     default: | ||||||
|                         LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor); |                         LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor); | ||||||
|                         exit(0); |                         UNIMPLEMENTED(); | ||||||
|                         break; |                         break; | ||||||
|                     } |                     } | ||||||
|                 }; |                 }; | ||||||
| 
 | 
 | ||||||
|  |                 using BlendEquation = decltype(params)::BlendEquation; | ||||||
|  |                 static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor, | ||||||
|  |                                                        const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor, | ||||||
|  |                                                        BlendEquation equation) { | ||||||
|  |                     Math::Vec4<int> result; | ||||||
|  | 
 | ||||||
|  |                     auto src_result = (src  *  srcfactor).Cast<int>(); | ||||||
|  |                     auto dst_result = (dest * destfactor).Cast<int>(); | ||||||
|  | 
 | ||||||
|  |                     switch (equation) { | ||||||
|  |                     case BlendEquation::Add: | ||||||
|  |                         result = (src_result + dst_result) / 255; | ||||||
|  |                         break; | ||||||
|  | 
 | ||||||
|  |                     case BlendEquation::Subtract: | ||||||
|  |                         result = (src_result - dst_result) / 255; | ||||||
|  |                         break; | ||||||
|  | 
 | ||||||
|  |                     case BlendEquation::ReverseSubtract: | ||||||
|  |                         result = (dst_result - src_result) / 255; | ||||||
|  |                         break; | ||||||
|  | 
 | ||||||
|  |                     // TODO: How do these two actually work?
 | ||||||
|  |                     //       OpenGL doesn't include the blend factors in the min/max computations,
 | ||||||
|  |                     //       but is this what the 3DS actually does?
 | ||||||
|  |                     case BlendEquation::Min: | ||||||
|  |                         result.r() = std::min(src.r(), dest.r()); | ||||||
|  |                         result.g() = std::min(src.g(), dest.g()); | ||||||
|  |                         result.b() = std::min(src.b(), dest.b()); | ||||||
|  |                         result.a() = std::min(src.a(), dest.a()); | ||||||
|  |                         break; | ||||||
|  | 
 | ||||||
|  |                     case BlendEquation::Max: | ||||||
|  |                         result.r() = std::max(src.r(), dest.r()); | ||||||
|  |                         result.g() = std::max(src.g(), dest.g()); | ||||||
|  |                         result.b() = std::max(src.b(), dest.b()); | ||||||
|  |                         result.a() = std::max(src.a(), dest.a()); | ||||||
|  |                         break; | ||||||
|  | 
 | ||||||
|  |                     default: | ||||||
|  |                         LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", equation); | ||||||
|  |                         UNIMPLEMENTED(); | ||||||
|  |                     } | ||||||
|  | 
 | ||||||
|  |                     return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255), | ||||||
|  |                                     MathUtil::Clamp(result.g(), 0, 255), | ||||||
|  |                                     MathUtil::Clamp(result.b(), 0, 255), | ||||||
|  |                                     MathUtil::Clamp(result.a(), 0, 255)); | ||||||
|  |                 }; | ||||||
|  | 
 | ||||||
|                 auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb), |                 auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb), | ||||||
|                                                LookupFactorA(params.factor_source_a)); |                                                LookupFactorA(params.factor_source_a)); | ||||||
|                 auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb), |                 auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb), | ||||||
|                                                LookupFactorA(params.factor_dest_a)); |                                                LookupFactorA(params.factor_dest_a)); | ||||||
| 
 | 
 | ||||||
|                 auto src_result = (combiner_output * srcfactor).Cast<int>(); |                 blend_output     = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb); | ||||||
|                 auto dst_result = (dest * dstfactor).Cast<int>(); |                 blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a(); | ||||||
| 
 |  | ||||||
|                 switch (params.blend_equation_rgb) { |  | ||||||
|                 case params.Add: |  | ||||||
|                 { |  | ||||||
|                     auto result = (src_result + dst_result) / 255; |  | ||||||
|                     result.r() = std::min(255, result.r()); |  | ||||||
|                     result.g() = std::min(255, result.g()); |  | ||||||
|                     result.b() = std::min(255, result.b()); |  | ||||||
|                     combiner_output = result.Cast<u8>(); |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|                  |  | ||||||
|                 case params.Subtract: |  | ||||||
|                 { |  | ||||||
|                     auto result = (src_result - dst_result) / 255; |  | ||||||
|                     result.r() = std::max(0, result.r()); |  | ||||||
|                     result.g() = std::max(0, result.g()); |  | ||||||
|                     result.b() = std::max(0, result.b()); |  | ||||||
|                     combiner_output = result.Cast<u8>(); |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|                  |  | ||||||
|                 case params.ReverseSubtract: |  | ||||||
|                 { |  | ||||||
|                     auto result = (dst_result - src_result) / 255; |  | ||||||
|                     result.r() = std::max(0, result.r()); |  | ||||||
|                     result.g() = std::max(0, result.g()); |  | ||||||
|                     result.b() = std::max(0, result.b()); |  | ||||||
|                     combiner_output = result.Cast<u8>(); |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|                  |  | ||||||
|                 case params.Min: |  | ||||||
|                 { |  | ||||||
|                     // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
 |  | ||||||
|                     Math::Vec4<int> result; |  | ||||||
|                     result.r() = std::min(combiner_output.r(),dest.r()); |  | ||||||
|                     result.g() = std::min(combiner_output.g(),dest.g()); |  | ||||||
|                     result.b() = std::min(combiner_output.b(),dest.b()); |  | ||||||
|                     combiner_output = result.Cast<u8>(); |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|                  |  | ||||||
|                 case params.Max: |  | ||||||
|                 { |  | ||||||
|                     // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
 |  | ||||||
|                     Math::Vec4<int> result; |  | ||||||
|                     result.r() = std::max(combiner_output.r(),dest.r()); |  | ||||||
|                     result.g() = std::max(combiner_output.g(),dest.g()); |  | ||||||
|                     result.b() = std::max(combiner_output.b(),dest.b()); |  | ||||||
|                     combiner_output = result.Cast<u8>(); |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
| 
 |  | ||||||
|                 default: |  | ||||||
|                     LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value()); |  | ||||||
|                     exit(0); |  | ||||||
|                 } |  | ||||||
|             } else { |             } else { | ||||||
|                 LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op); |                 LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op); | ||||||
|                 exit(0); |                 UNIMPLEMENTED(); | ||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|             const Math::Vec4<u8> result = { |             const Math::Vec4<u8> result = { | ||||||
|                 registers.output_merger.red_enable   ? combiner_output.r() : dest.r(), |                 registers.output_merger.red_enable   ? blend_output.r() : dest.r(), | ||||||
|                 registers.output_merger.green_enable ? combiner_output.g() : dest.g(), |                 registers.output_merger.green_enable ? blend_output.g() : dest.g(), | ||||||
|                 registers.output_merger.blue_enable  ? combiner_output.b() : dest.b(), |                 registers.output_merger.blue_enable  ? blend_output.b() : dest.b(), | ||||||
|                 registers.output_merger.alpha_enable ? combiner_output.a() : dest.a() |                 registers.output_merger.alpha_enable ? blend_output.a() : dest.a() | ||||||
|             }; |             }; | ||||||
| 
 | 
 | ||||||
|             DrawPixel(x >> 4, y >> 4, result); |             DrawPixel(x >> 4, y >> 4, result); | ||||||
| @ -694,6 +760,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | void ProcessTriangle(const VertexShader::OutputVertex& v0, | ||||||
|  |                      const VertexShader::OutputVertex& v1, | ||||||
|  |                      const VertexShader::OutputVertex& v2) { | ||||||
|  |     ProcessTriangleInternal(v0, v1, v2); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| } // namespace Rasterizer
 | } // namespace Rasterizer
 | ||||||
| 
 | 
 | ||||||
| } // namespace Pica
 | } // namespace Pica
 | ||||||
|  | |||||||
| @ -85,8 +85,11 @@ struct VertexShaderState { | |||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     struct CallStackElement { |     struct CallStackElement { | ||||||
|         u32 final_address; |         u32 final_address;  // Address upon which we jump to return_address
 | ||||||
|         u32 return_address; |         u32 return_address; // Where to jump when leaving scope
 | ||||||
|  |         u8 repeat_counter;  // How often to repeat until this call stack element is removed
 | ||||||
|  |         u8 loop_increment;  // Which value to add to the loop counter after an iteration
 | ||||||
|  |                             // TODO: Should this be a signed value? Does it even matter?
 | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     // TODO: Is there a maximal size for this?
 |     // TODO: Is there a maximal size for this?
 | ||||||
| @ -105,9 +108,14 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||||||
| 
 | 
 | ||||||
|     while (true) { |     while (true) { | ||||||
|         if (!state.call_stack.empty()) { |         if (!state.call_stack.empty()) { | ||||||
|             if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) { |             auto& top = state.call_stack.top(); | ||||||
|                 state.program_counter = &shader_memory[state.call_stack.top().return_address]; |             if (state.program_counter - shader_memory.data() == top.final_address) { | ||||||
|  |                 state.address_registers[2] += top.loop_increment; | ||||||
|  | 
 | ||||||
|  |                 if (top.repeat_counter-- == 0) { | ||||||
|  |                     state.program_counter = &shader_memory[top.return_address]; | ||||||
|                     state.call_stack.pop(); |                     state.call_stack.pop(); | ||||||
|  |                 } | ||||||
| 
 | 
 | ||||||
|                 // TODO: Is "trying again" accurate to hardware?
 |                 // TODO: Is "trying again" accurate to hardware?
 | ||||||
|                 continue; |                 continue; | ||||||
| @ -118,9 +126,10 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||||||
|         const Instruction& instr = *(const Instruction*)state.program_counter; |         const Instruction& instr = *(const Instruction*)state.program_counter; | ||||||
|         const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; |         const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; | ||||||
| 
 | 
 | ||||||
|         auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions, u32 return_offset) { |         static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions, | ||||||
|  |                               u32 return_offset, u8 repeat_count, u8 loop_increment) { | ||||||
|             state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
 |             state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
 | ||||||
|             state.call_stack.push({ offset + num_instructions, return_offset }); |             state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment }); | ||||||
|         }; |         }; | ||||||
|         u32 binary_offset = state.program_counter - shader_memory.data(); |         u32 binary_offset = state.program_counter - shader_memory.data(); | ||||||
| 
 | 
 | ||||||
| @ -457,7 +466,7 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||||||
|                 call(state, |                 call(state, | ||||||
|                      instr.flow_control.dest_offset, |                      instr.flow_control.dest_offset, | ||||||
|                      instr.flow_control.num_instructions, |                      instr.flow_control.num_instructions, | ||||||
|                      binary_offset + 1); |                      binary_offset + 1, 0, 0); | ||||||
|                 break; |                 break; | ||||||
| 
 | 
 | ||||||
|             case Instruction::OpCode::CALLU: |             case Instruction::OpCode::CALLU: | ||||||
| @ -465,7 +474,7 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||||||
|                     call(state, |                     call(state, | ||||||
|                         instr.flow_control.dest_offset, |                         instr.flow_control.dest_offset, | ||||||
|                         instr.flow_control.num_instructions, |                         instr.flow_control.num_instructions, | ||||||
|                         binary_offset + 1); |                         binary_offset + 1, 0, 0); | ||||||
|                 } |                 } | ||||||
|                 break; |                 break; | ||||||
| 
 | 
 | ||||||
| @ -474,7 +483,7 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||||||
|                     call(state, |                     call(state, | ||||||
|                         instr.flow_control.dest_offset, |                         instr.flow_control.dest_offset, | ||||||
|                         instr.flow_control.num_instructions, |                         instr.flow_control.num_instructions, | ||||||
|                         binary_offset + 1); |                         binary_offset + 1, 0, 0); | ||||||
|                 } |                 } | ||||||
|                 break; |                 break; | ||||||
| 
 | 
 | ||||||
| @ -486,12 +495,12 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||||||
|                     call(state, |                     call(state, | ||||||
|                          binary_offset + 1, |                          binary_offset + 1, | ||||||
|                          instr.flow_control.dest_offset - binary_offset - 1, |                          instr.flow_control.dest_offset - binary_offset - 1, | ||||||
|                          instr.flow_control.dest_offset + instr.flow_control.num_instructions); |                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); | ||||||
|                 } else { |                 } else { | ||||||
|                     call(state, |                     call(state, | ||||||
|                          instr.flow_control.dest_offset, |                          instr.flow_control.dest_offset, | ||||||
|                          instr.flow_control.num_instructions, |                          instr.flow_control.num_instructions, | ||||||
|                          instr.flow_control.dest_offset + instr.flow_control.num_instructions); |                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); | ||||||
|                 } |                 } | ||||||
| 
 | 
 | ||||||
|                 break; |                 break; | ||||||
| @ -504,17 +513,30 @@ static void ProcessShaderCode(VertexShaderState& state) { | |||||||
|                     call(state, |                     call(state, | ||||||
|                          binary_offset + 1, |                          binary_offset + 1, | ||||||
|                          instr.flow_control.dest_offset - binary_offset - 1, |                          instr.flow_control.dest_offset - binary_offset - 1, | ||||||
|                          instr.flow_control.dest_offset + instr.flow_control.num_instructions); |                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); | ||||||
|                 } else { |                 } else { | ||||||
|                     call(state, |                     call(state, | ||||||
|                          instr.flow_control.dest_offset, |                          instr.flow_control.dest_offset, | ||||||
|                          instr.flow_control.num_instructions, |                          instr.flow_control.num_instructions, | ||||||
|                          instr.flow_control.dest_offset + instr.flow_control.num_instructions); |                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); | ||||||
|                 } |                 } | ||||||
| 
 | 
 | ||||||
|                 break; |                 break; | ||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|  |             case Instruction::OpCode::LOOP: | ||||||
|  |             { | ||||||
|  |                 state.address_registers[2] = shader_uniforms.i[instr.flow_control.int_uniform_id].y; | ||||||
|  | 
 | ||||||
|  |                 call(state, | ||||||
|  |                      binary_offset + 1, | ||||||
|  |                      instr.flow_control.dest_offset - binary_offset + 1, | ||||||
|  |                      instr.flow_control.dest_offset + 1, | ||||||
|  |                      shader_uniforms.i[instr.flow_control.int_uniform_id].x, | ||||||
|  |                      shader_uniforms.i[instr.flow_control.int_uniform_id].z); | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|             default: |             default: | ||||||
|                 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", |                 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", | ||||||
|                           (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex); |                           (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex); | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 bunnei
						bunnei