Merge pull request #387 from Subv/maxwell_2d

GPU: Partially implemented the 2D surface copy engine
2025-12-19 22:02:40 +00:00 · 2018-04-25 20:40:17 -04:00
parent d0825c9519 20d86d8a36
commit 42d43ea741
10 changed files with 203 additions and 52 deletions
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -659,6 +659,10 @@ void CopyBlock(const Kernel::Process& process, VAddr dest_addr, VAddr src_addr,
    }
 }
 /// Convenience overload: copies `size` bytes from `src_addr` to `dest_addr` by forwarding to the
 /// process-taking CopyBlock overload with the process returned by Core::CurrentProcess().
 void CopyBlock(VAddr dest_addr, VAddr src_addr, size_t size) {
    CopyBlock(*Core::CurrentProcess(), dest_addr, src_addr, size);
 }
 boost::optional<PAddr> TryVirtualToPhysicalAddress(const VAddr addr) {
    if (addr == 0) {
        return 0;
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -24,10 +24,7 @@ namespace Tegra {
 enum class BufferMethods {
    BindObject = 0,
-    SetGraphMacroCode = 0x45,
+    CountBufferMethods = 0x40,
    SetGraphMacroCodeArg = 0x46,
    SetGraphMacroEntry = 0x47,
    CountBufferMethods = 0x100,
 };
 void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) {
@@ -36,28 +33,6 @@ void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params)
                  "{:08X} remaining params {}",
                  method, subchannel, value, remaining_params);
    if (method == static_cast<u32>(BufferMethods::SetGraphMacroEntry)) {
        // Prepare to upload a new macro, reset the upload counter.
        NGLOG_DEBUG(HW_GPU, "Uploading GPU macro {:08X}", value);
        current_macro_entry = value;
        current_macro_code.clear();
        return;
    }
    if (method == static_cast<u32>(BufferMethods::SetGraphMacroCodeArg)) {
        // Append a new code word to the current macro.
        current_macro_code.push_back(value);
        // There are no more params remaining, submit the code to the 3D engine.
        if (remaining_params == 0) {
            maxwell_3d->SubmitMacroCode(current_macro_entry, std::move(current_macro_code));
            current_macro_entry = InvalidGraphMacroEntry;
            current_macro_code.clear();
        }
        return;
    }
    if (method == static_cast<u32>(BufferMethods::BindObject)) {
        // Bind the current subchannel to the desired engine id.
        NGLOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value);
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -2,12 +2,71 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include "core/memory.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/textures/decoders.h"
 namespace Tegra {
 namespace Engines {
-void Fermi2D::WriteReg(u32 method, u32 value) {}
+Fermi2D::Fermi2D(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
 /// Stores `value` into the register selected by `method` and, when that register is the copy
 /// trigger, immediately performs the configured surface copy.
 void Fermi2D::WriteReg(u32 method, u32 value) {
    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid Fermi2D register, increase the size of the Regs structure");
    // Latch the value first so the handler below observes the freshly written register state.
    regs.reg_array[method] = value;
    // Only the trigger register has a side effect; every other write is a plain store.
    if (method == FERMI2D_REG_INDEX(trigger)) {
        HandleSurfaceCopy();
    }
 }
 void Fermi2D::HandleSurfaceCopy() {
    NGLOG_WARNING(HW_GPU, "Requested a surface copy with operation {}",
                  static_cast<u32>(regs.operation));
    const GPUVAddr source = regs.src.Address();
    const GPUVAddr dest = regs.dst.Address();
    // TODO(Subv): Only same-format and same-size copies are allowed for now.
    ASSERT(regs.src.format == regs.dst.format);
    ASSERT(regs.src.width * regs.src.height == regs.dst.width * regs.dst.height);
    // TODO(Subv): Only raw copies are implemented.
    ASSERT(regs.operation == Regs::Operation::SrcCopy);
    const VAddr source_cpu = *memory_manager.GpuToCpuAddress(source);
    const VAddr dest_cpu = *memory_manager.GpuToCpuAddress(dest);
    u32 src_bytes_per_pixel = RenderTargetBytesPerPixel(regs.src.format);
    u32 dst_bytes_per_pixel = RenderTargetBytesPerPixel(regs.dst.format);
    if (regs.src.linear == regs.dst.linear) {
        // If the input layout and the output layout are the same, just perform a raw copy.
        Memory::CopyBlock(dest_cpu, source_cpu,
                          src_bytes_per_pixel * regs.dst.width * regs.dst.height);
        return;
    }
    u8* src_buffer = Memory::GetPointer(source_cpu);
    u8* dst_buffer = Memory::GetPointer(dest_cpu);
    if (!regs.src.linear && regs.dst.linear) {
        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
        Texture::CopySwizzledData(regs.src.width, regs.src.height, src_bytes_per_pixel,
                                  dst_bytes_per_pixel, src_buffer, dst_buffer, true,
                                  regs.src.block_height);
    } else {
        // If the input is linear and the output is tiled, swizzle the input and copy it over.
        Texture::CopySwizzledData(regs.src.width, regs.src.height, src_bytes_per_pixel,
                                  dst_bytes_per_pixel, dst_buffer, src_buffer, false,
                                  regs.dst.block_height);
    }
 }
 } // namespace Engines
 } // namespace Tegra
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -4,19 +4,106 @@
 #pragma once
 #include <array>
 #include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 namespace Tegra {
 namespace Engines {
 /// Expands to the index of `field_name` within Fermi2D::Regs, measured in 32-bit words (its byte
 /// offset divided by sizeof(u32)). The result is comparable with a register `method` number.
 #define FERMI2D_REG_INDEX(field_name)                                                              \
    (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32))
 class Fermi2D final {
 public:
-    Fermi2D() = default;
+    explicit Fermi2D(MemoryManager& memory_manager);
    ~Fermi2D() = default;
    /// Write the value to the register identified by method.
    void WriteReg(u32 method, u32 value);
    /// Register file of the Fermi2D engine. The named fields and `reg_array` alias the same
    /// storage, so a register can be accessed either by name or by its word index.
    struct Regs {
        /// Number of 32-bit registers addressable through reg_array.
        static constexpr size_t NUM_REGS = 0x258;
        /// Description of one surface taking part in a copy; occupies 10 register words.
        struct Surface {
            RenderTargetFormat format;
            /// Non-zero when the surface uses a linear layout; zero is treated as tiled.
            BitField<0, 1, u32> linear;
            /// Block dimensions, all packed into a single register word.
            union {
                BitField<0, 4, u32> block_depth;
                BitField<4, 4, u32> block_height;
                BitField<8, 4, u32> block_width;
            };
            u32 depth;
            u32 layer;
            u32 pitch;
            u32 width;
            u32 height;
            u32 address_high;
            u32 address_low;
            /// Returns the surface GPU address, with address_high as the upper 32 bits and
            /// address_low as the lower 32 bits.
            GPUVAddr Address() const {
                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
                                             address_low);
            }
        };
        static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
        /// Values accepted by the `operation` register.
        enum class Operation : u32 {
            SrcCopyAnd = 0,
            ROPAnd = 1,
            Blend = 2,
            SrcCopy = 3,
            ROP = 4,
            SrcCopyPremult = 5,
            BlendPremult = 6,
        };
        union {
            struct {
                INSERT_PADDING_WORDS(0x80);
                Surface dst; // words 0x80-0x89
                INSERT_PADDING_WORDS(2);
                Surface src; // words 0x8C-0x95
                INSERT_PADDING_WORDS(0x15);
                Operation operation; // word 0xAB
                INSERT_PADDING_WORDS(0x9);
                // TODO(Subv): This is only a guess.
                u32 trigger; // word 0xB5
                // Pads the named view up to exactly NUM_REGS words: the fields above end at
                // word 0xB6 and 0xB6 + 0x1A2 == 0x258. The previous value of 0x1A3 made the
                // named view one word larger than reg_array.
                INSERT_PADDING_WORDS(0x1A2);
            };
            std::array<u32, NUM_REGS> reg_array;
        };
    } regs{};
    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Fermi2D Regs has wrong size");
    MemoryManager& memory_manager;
 private:
    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
    void HandleSurfaceCopy();
 };
 /// Compile-time check that `field_name` sits at word index `position` inside Fermi2D::Regs
 /// (byte offset == position * 4). The macro is local to this header and undefined after use.
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
    static_assert(offsetof(Fermi2D::Regs, field_name) == position * 4,                             \
                  "Field " #field_name " has invalid position")
 // Word indices of the registers the 2D engine currently interprets.
 ASSERT_REG_POSITION(dst, 0x80);
 ASSERT_REG_POSITION(src, 0x8C);
 ASSERT_REG_POSITION(operation, 0xAB);
 ASSERT_REG_POSITION(trigger, 0xB5);
 #undef ASSERT_REG_POSITION
 } // namespace Engines
 } // namespace Tegra
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -22,10 +22,6 @@ constexpr u32 MacroRegistersStart = 0xE00;
 /// Constructs the 3D engine, keeping a reference to the given memory manager and binding the
 /// macro interpreter to this engine instance.
 Maxwell3D::Maxwell3D(MemoryManager& memory_manager)
    : memory_manager(memory_manager), macro_interpreter(*this) {}
 /// Stores `code` as the program for macro `entry`, replacing anything stored under that key.
 void Maxwell3D::SubmitMacroCode(u32 entry, std::vector<u32> code) {
    // Macros are keyed by a method number derived from the entry: two methods per entry,
    // starting at MacroRegistersStart.
    const u32 macro_method = entry * 2 + MacroRegistersStart;
    uploaded_macros[macro_method] = std::move(code);
 }
 void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
    auto macro_code = uploaded_macros.find(method);
    // The requested macro must have been uploaded already.
@@ -37,9 +33,6 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
 }
 void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid Maxwell3D register, increase the size of the Regs structure");
    auto debug_context = Core::System::GetInstance().GetGPUDebugContext();
    // It is an error to write to a register other than the current macro's ARG register before it
@@ -68,6 +61,9 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
        return;
    }
    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid Maxwell3D register, increase the size of the Regs structure");
    if (debug_context) {
        debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandLoaded, nullptr);
    }
@@ -75,6 +71,10 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
    regs.reg_array[method] = value;
    switch (method) {
    case MAXWELL3D_REG_INDEX(macros.data): {
        ProcessMacroUpload(value);
        break;
    }
    case MAXWELL3D_REG_INDEX(code_address.code_address_high):
    case MAXWELL3D_REG_INDEX(code_address.code_address_low): {
        // Note: For some reason games (like Puyo Puyo Tetris) seem to write 0 to the CODE_ADDRESS
@@ -141,6 +141,12 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
    }
 }
 /// Appends one uploaded code word to the macro selected by the `macros.entry` register.
 void Maxwell3D::ProcessMacroUpload(u32 data) {
    // Macros are keyed by a method number derived from the entry register: two methods per
    // entry, starting at MacroRegistersStart.
    const u32 macro_method = regs.macros.entry * 2 + MacroRegistersStart;
    // Keep the uploaded code so it can be interpreted when the macro is called.
    uploaded_macros[macro_method].push_back(data);
 }
 void Maxwell3D::ProcessQueryGet() {
    GPUVAddr sequence_address = regs.query.QueryAddress();
    // Since the sequence address is given as a GPU VAddr, we have to convert it to an application
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -31,7 +31,7 @@ public:
    /// Register structure of the Maxwell3D engine.
    /// TODO(Subv): This structure will need to be made bigger as more registers are discovered.
    struct Regs {
-        static constexpr size_t NUM_REGS = 0xE36;
+        static constexpr size_t NUM_REGS = 0xE00;
        static constexpr size_t NumRenderTargets = 8;
        static constexpr size_t NumViewports = 16;
@@ -322,7 +322,15 @@ public:
        union {
            struct {
-                INSERT_PADDING_WORDS(0x200);
+                INSERT_PADDING_WORDS(0x45);
                struct {
                    INSERT_PADDING_WORDS(1);
                    u32 data;
                    u32 entry;
                } macros;
                INSERT_PADDING_WORDS(0x1B8);
                struct {
                    u32 address_high;
@@ -605,7 +613,7 @@ public:
                    u32 size[MaxShaderStage];
                } tex_info_buffers;
-                INSERT_PADDING_WORDS(0x102);
+                INSERT_PADDING_WORDS(0xCC);
            };
            std::array<u32, NUM_REGS> reg_array;
        };
@@ -637,9 +645,6 @@ public:
    /// Write the value to the register identified by method.
    void WriteReg(u32 method, u32 value, u32 remaining_params);
    /// Uploads the code for a GPU macro program associated with the specified entry.
    void SubmitMacroCode(u32 entry, std::vector<u32> code);
    /// Returns a list of enabled textures for the specified shader stage.
    std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const;
@@ -670,6 +675,9 @@ private:
     */
    void CallMacroMethod(u32 method, std::vector<u32> parameters);
    /// Handles writes to the macro uploading registers.
    void ProcessMacroUpload(u32 data);
    /// Handles a write to the QUERY_GET register.
    void ProcessQueryGet();
@@ -687,6 +695,7 @@ private:
    static_assert(offsetof(Maxwell3D::Regs, field_name) == position * 4,                           \
                  "Field " #field_name " has invalid position")
 ASSERT_REG_POSITION(macros, 0x45);
 ASSERT_REG_POSITION(rt, 0x200);
 ASSERT_REG_POSITION(viewport_transform[0], 0x280);
 ASSERT_REG_POSITION(viewport, 0x300);
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -12,7 +12,7 @@ namespace Tegra {
 GPU::GPU() {
    memory_manager = std::make_unique<MemoryManager>();
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(*memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>();
+    fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
    maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
 }
@@ -22,4 +22,16 @@ const Tegra::Engines::Maxwell3D& GPU::Get3DEngine() const {
    return *maxwell_3d;
 }
 /// Returns the number of bytes one pixel occupies for the given render target format.
 ///
 /// `format` must not be RenderTargetFormat::NONE. Formats that are not handled yet are reported
 /// through UNIMPLEMENTED_MSG and yield 0.
 u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
    ASSERT(format != RenderTargetFormat::NONE);
    switch (format) {
    case RenderTargetFormat::RGBA8_UNORM:
    case RenderTargetFormat::RGB10_A2_UNORM:
        return 4;
    default:
        UNIMPLEMENTED_MSG("Unimplemented render target format %u", static_cast<u32>(format));
        // Only reached if the macro above returns; without this, control would flow off the
        // end of a non-void function, which is undefined behavior. A size of 0 makes callers
        // copy nothing.
        return 0;
    }
 }
 } // namespace Tegra
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -21,6 +21,9 @@ enum class RenderTargetFormat : u32 {
    RGBA8_SRGB = 0xD6,
 };
 /// Returns the number of bytes per pixel of each rendertarget format.
 u32 RenderTargetBytesPerPixel(RenderTargetFormat format);
 class DebugContext;
 /**
@@ -86,8 +89,6 @@ public:
    }
 private:
    static constexpr u32 InvalidGraphMacroEntry = 0xFFFFFFFF;
    /// Writes a single register in the engine bound to the specified subchannel
    void WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params);
@@ -100,11 +101,6 @@ private:
    std::unique_ptr<Engines::Fermi2D> fermi_2d;
    /// Compute engine
    std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
    /// Entry of the macro that is currently being uploaded
    u32 current_macro_entry = InvalidGraphMacroEntry;
    /// Code being uploaded for the current macro
    std::vector<u32> current_macro_code;
 };
 } // namespace Tegra
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -27,9 +27,8 @@ static u32 GetSwizzleOffset(u32 x, u32 y, u32 image_width, u32 bytes_per_pixel,
    return address;
 }
-static void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
+void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
-                             u8* swizzled_data, u8* unswizzled_data, bool unswizzle,
+                      u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height) {
                             u32 block_height) {
    u8* data_ptrs[2];
    for (unsigned y = 0; y < height; ++y) {
        for (unsigned x = 0; x < width; ++x) {
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -17,6 +17,10 @@ namespace Texture {
 std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width, u32 height,
                                 u32 block_height = TICEntry::DefaultBlockHeight);
 /// Copies texture data from a buffer and performs swizzling/unswizzling as necessary.
 void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
                      u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height);
 /**
 * Decodes an unswizzled texture into a A8R8G8B8 texture.
 */