From 6df1ab92e0862905fa45fbb161e9afcca79e00b7 Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Tue, 6 Jun 2017 22:28:31 -0400 Subject: [PATCH 1/2] Adding Quat_32 rotation format. Refactored and moved a few things to clean up. --- .../acl/algorithm/uniformly_sampled/common.h | 98 ++++++++++++------- .../acl/algorithm/uniformly_sampled/decoder.h | 90 ++++++++--------- .../acl/algorithm/uniformly_sampled/encoder.h | 67 ++++++------- includes/acl/core/algorithm_globals.h | 13 ++- includes/acl/math/scalar_32.h | 16 +++ tools/acl_compressor/sources/main.cpp | 1 + 6 files changed, 166 insertions(+), 119 deletions(-) diff --git a/includes/acl/algorithm/uniformly_sampled/common.h b/includes/acl/algorithm/uniformly_sampled/common.h index b5e38429..48583673 100644 --- a/includes/acl/algorithm/uniformly_sampled/common.h +++ b/includes/acl/algorithm/uniformly_sampled/common.h @@ -33,49 +33,79 @@ namespace acl { namespace uniformly_sampled { - struct FullPrecisionConstants + namespace impl { - static constexpr uint32_t NUM_TRACKS_PER_BONE = 2; - static constexpr uint32_t BITSET_WIDTH = 32; - }; + struct FullPrecisionConstants + { + static constexpr uint32_t NUM_TRACKS_PER_BONE = 2; + static constexpr uint32_t BITSET_WIDTH = 32; + }; - struct FullPrecisionHeader - { - uint16_t num_bones; - RotationFormat8 rotation_format; - uint32_t num_samples; - uint32_t sample_rate; // TODO: Store duration as float instead - uint32_t num_animated_rotation_tracks; // TODO: Calculate from bitsets? - uint32_t num_animated_translation_tracks; // TODO: Calculate from bitsets? + struct FullPrecisionHeader + { + uint16_t num_bones; + RotationFormat8 rotation_format; + uint32_t num_samples; + uint32_t sample_rate; // TODO: Store duration as float instead + uint32_t num_animated_rotation_tracks; // TODO: Calculate from bitsets? + uint32_t num_animated_translation_tracks; // TODO: Calculate from bitsets? - PtrOffset16 default_tracks_bitset_offset; - PtrOffset16 constant_tracks_bitset_offset; - PtrOffset16 constant_track_data_offset; - PtrOffset16 track_data_offset; + PtrOffset16 default_tracks_bitset_offset; + PtrOffset16 constant_tracks_bitset_offset; + PtrOffset16 constant_track_data_offset; + PtrOffset16 track_data_offset; - ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// - uint32_t* get_default_tracks_bitset() { return default_tracks_bitset_offset.add_to(this); } - const uint32_t* get_default_tracks_bitset() const { return default_tracks_bitset_offset.add_to(this); } + uint32_t* get_default_tracks_bitset() { return default_tracks_bitset_offset.add_to(this); } + const uint32_t* get_default_tracks_bitset() const { return default_tracks_bitset_offset.add_to(this); } - uint32_t* get_constant_tracks_bitset() { return constant_tracks_bitset_offset.add_to(this); } - const uint32_t* get_constant_tracks_bitset() const { return constant_tracks_bitset_offset.add_to(this); } + uint32_t* get_constant_tracks_bitset() { return constant_tracks_bitset_offset.add_to(this); } + const uint32_t* get_constant_tracks_bitset() const { return constant_tracks_bitset_offset.add_to(this); } - uint8_t* get_constant_track_data() { return constant_track_data_offset.add_to(this); } - const uint8_t* get_constant_track_data() const { return constant_track_data_offset.add_to(this); } + uint8_t* get_constant_track_data() { return constant_track_data_offset.add_to(this); } + const uint8_t* get_constant_track_data() const { return constant_track_data_offset.add_to(this); } - uint8_t* get_track_data() { return track_data_offset.add_to(this); } - const uint8_t* get_track_data() const { return track_data_offset.add_to(this); } - }; + uint8_t* get_track_data() { return track_data_offset.add_to(this); } + const uint8_t* get_track_data() const { return track_data_offset.add_to(this); } + }; - constexpr FullPrecisionHeader& get_full_precision_header(CompressedClip& clip) - { - return *add_offset_to_ptr(&clip, sizeof(CompressedClip)); - } + constexpr FullPrecisionHeader& get_full_precision_header(CompressedClip& clip) + { + return *add_offset_to_ptr(&clip, sizeof(CompressedClip)); + } - constexpr const FullPrecisionHeader& get_full_precision_header(const CompressedClip& clip) - { - return *add_offset_to_ptr(&clip, sizeof(CompressedClip)); - } + constexpr const FullPrecisionHeader& get_full_precision_header(const CompressedClip& clip) + { + return *add_offset_to_ptr(&clip, sizeof(CompressedClip)); + } + + // TODO: constexpr + inline uint32_t get_rotation_size(RotationFormat8 rotation_format) + { + switch (rotation_format) + { + case RotationFormat8::Quat_128: return sizeof(float) * 4; + case RotationFormat8::Quat_96: return sizeof(float) * 3; + case RotationFormat8::Quat_48: return sizeof(uint16_t) * 3; + case RotationFormat8::Quat_32: return sizeof(uint32_t); + default: + ACL_ENSURE(false, "Invalid or unsupported rotation format: %s", get_rotation_format_name(rotation_format)); + return 0; + } + } + + // TODO: constexpr + inline uint32_t get_translation_size(VectorFormat8 translation_format) + { + switch (translation_format) + { + case VectorFormat8::Vector3_96: return sizeof(float) * 3; + default: + ACL_ENSURE(false, "Invalid or unsupported translation format: %s", get_vector_format_name(translation_format)); + return 0; + } + } + } } } diff --git a/includes/acl/algorithm/uniformly_sampled/decoder.h b/includes/acl/algorithm/uniformly_sampled/decoder.h index 1c522060..ad63ba19 100644 --- a/includes/acl/algorithm/uniformly_sampled/decoder.h +++ b/includes/acl/algorithm/uniformly_sampled/decoder.h @@ -81,25 +81,11 @@ namespace acl inline void initialize_context(const FullPrecisionHeader& header, uint32_t key_frame0, uint32_t key_frame1, DecompressionContext& context) { + uint32_t rotation_size = get_rotation_size(header.rotation_format); + uint32_t translation_size = get_translation_size(VectorFormat8::Vector3_96); + // TODO: No need to store this, unpack from bitset in context and simplify branching logic below? - // TODO: Use a compile time flag to determine the rotation format and avoid a runtime branch - uint32_t animated_pose_size = sizeof(float) * (header.num_animated_translation_tracks * 3); - uint32_t rotation_size = 0; - if (header.rotation_format == RotationFormat8::Quat_128) - { - animated_pose_size += sizeof(float) * (header.num_animated_rotation_tracks * 4); - rotation_size = sizeof(float) * 4; - } - else if (header.rotation_format == RotationFormat8::Quat_96) - { - animated_pose_size += sizeof(float) * (header.num_animated_rotation_tracks * 3); - rotation_size = sizeof(float) * 3; - } - else if (header.rotation_format == RotationFormat8::Quat_48) - { - animated_pose_size += sizeof(uint16_t) * (header.num_animated_rotation_tracks * 3); - rotation_size = sizeof(uint16_t) * 3; - } + uint32_t animated_pose_size = (rotation_size * header.num_animated_rotation_tracks) + (translation_size * header.num_animated_translation_tracks); context.default_tracks_bitset = header.get_default_tracks_bitset(); @@ -192,6 +178,17 @@ namespace acl return quat_from_positive_w(rotation_xyz); } + inline Quat_32 decompress_rotation_quat_32(const uint8_t* data_ptr) + { + // TODO: Always aligned for now because translations are always 12 bytes + const uint32_t* data_ptr_u32 = safe_ptr_cast(data_ptr); + size_t x = *data_ptr_u32 >> 21; + size_t y = (*data_ptr_u32 >> 10) & ((1 << 11) - 1); + size_t z = *data_ptr_u32 & ((1 << 10) - 1); + Vector4_32 rotation_xyz = vector_set(dequantize_signed_normalized(x, 11), dequantize_signed_normalized(y, 11), dequantize_signed_normalized(z, 10)); + return quat_from_positive_w(rotation_xyz); + } + inline Quat_32 decompress_rotation(DecompressionContext& context, RotationFormat8 rotation_format, float interpolation_alpha) { Quat_32 rotation; @@ -210,6 +207,8 @@ namespace acl rotation = decompress_rotation_quat_96(context.constant_track_data); else if (rotation_format == RotationFormat8::Quat_48) rotation = decompress_rotation_quat_48(context.constant_track_data); + else if (rotation_format == RotationFormat8::Quat_32) + rotation = decompress_rotation_quat_32(context.constant_track_data); context.constant_track_data += context.rotation_size; } @@ -234,6 +233,11 @@ namespace acl rotation0 = decompress_rotation_quat_48(context.key_frame_data0); rotation1 = decompress_rotation_quat_48(context.key_frame_data1); } + else if (rotation_format == RotationFormat8::Quat_32) + { + rotation0 = decompress_rotation_quat_32(context.key_frame_data0); + rotation1 = decompress_rotation_quat_32(context.key_frame_data1); + } rotation = quat_lerp(rotation0, rotation1, interpolation_alpha); @@ -247,14 +251,6 @@ namespace acl return rotation; } - template - inline void unaligned_load(const uint8_t* src, DestPtrType* dest, size_t size) - { - uint8_t* dest_u8 = reinterpret_cast(dest); - for (size_t byte_index = 0; byte_index < size; ++byte_index) - dest_u8[byte_index] = src[byte_index]; - } - inline Vector4_32 decompress_translation(DecompressionContext& context, RotationFormat8 rotation_format, float interpolation_alpha) { Vector4_32 translation; @@ -272,9 +268,9 @@ namespace acl else { float* translation_xyz = vector_as_float_ptr(translation); - unaligned_load(context.constant_track_data + (0 * sizeof(float)), translation_xyz + 0, sizeof(float)); - unaligned_load(context.constant_track_data + (1 * sizeof(float)), translation_xyz + 1, sizeof(float)); - unaligned_load(context.constant_track_data + (2 * sizeof(float)), translation_xyz + 2, sizeof(float)); + scalar_unaligned_load(context.constant_track_data + (0 * sizeof(float)), translation_xyz[0]); + scalar_unaligned_load(context.constant_track_data + (1 * sizeof(float)), translation_xyz[1]); + scalar_unaligned_load(context.constant_track_data + (2 * sizeof(float)), translation_xyz[2]); translation_xyz[3] = 0.0f; } @@ -292,15 +288,15 @@ namespace acl else { float* translation0_xyz = vector_as_float_ptr(translation0); - unaligned_load(context.key_frame_data0 + (0 * sizeof(float)), translation0_xyz + 0, sizeof(float)); - unaligned_load(context.key_frame_data0 + (1 * sizeof(float)), translation0_xyz + 1, sizeof(float)); - unaligned_load(context.key_frame_data0 + (2 * sizeof(float)), translation0_xyz + 2, sizeof(float)); + scalar_unaligned_load(context.key_frame_data0 + (0 * sizeof(float)), translation0_xyz[0]); + scalar_unaligned_load(context.key_frame_data0 + (1 * sizeof(float)), translation0_xyz[1]); + scalar_unaligned_load(context.key_frame_data0 + (2 * sizeof(float)), translation0_xyz[2]); translation0_xyz[3] = 0.0f; float* translation1_xyz = vector_as_float_ptr(translation1); - unaligned_load(context.key_frame_data1 + (0 * sizeof(float)), translation1_xyz + 0, sizeof(float)); - unaligned_load(context.key_frame_data1 + (1 * sizeof(float)), translation1_xyz + 1, sizeof(float)); - unaligned_load(context.key_frame_data1 + (2 * sizeof(float)), translation1_xyz + 2, sizeof(float)); + scalar_unaligned_load(context.key_frame_data1 + (0 * sizeof(float)), translation1_xyz[0]); + scalar_unaligned_load(context.key_frame_data1 + (1 * sizeof(float)), translation1_xyz[1]); + scalar_unaligned_load(context.key_frame_data1 + (2 * sizeof(float)), translation1_xyz[2]); translation1_xyz[3] = 0.0f; } @@ -320,6 +316,8 @@ namespace acl template inline void decompress_pose(const CompressedClip& clip, float sample_time, OutputWriterType& writer) { + using namespace impl; + ACL_ENSURE(clip.get_algorithm_type() == AlgorithmType8::UniformlySampled, "Invalid algorithm type [%s], expected [%s]", get_algorithm_name(clip.get_algorithm_type()), get_algorithm_name(AlgorithmType8::UniformlySampled)); ACL_ENSURE(clip.is_valid(false), "Clip is invalid"); @@ -332,21 +330,23 @@ namespace acl float interpolation_alpha; calculate_interpolation_keys(header.num_samples, clip_duration, sample_time, key_frame0, key_frame1, interpolation_alpha); - impl::DecompressionContext context; - impl::initialize_context(header, key_frame0, key_frame1, context); + DecompressionContext context; + initialize_context(header, key_frame0, key_frame1, context); for (uint32_t bone_index = 0; bone_index < header.num_bones; ++bone_index) { - Quat_32 rotation = impl::decompress_rotation(context, header.rotation_format, interpolation_alpha); + Quat_32 rotation = decompress_rotation(context, header.rotation_format, interpolation_alpha); writer.write_bone_rotation(bone_index, rotation); - Vector4_32 translation = impl::decompress_translation(context, header.rotation_format, interpolation_alpha); + Vector4_32 translation = decompress_translation(context, header.rotation_format, interpolation_alpha); writer.write_bone_translation(bone_index, translation); } } inline void decompress_bone(const CompressedClip& clip, float sample_time, uint16_t sample_bone_index, Quat_32* out_rotation, Vector4_32* out_translation) { + using namespace impl; + ACL_ENSURE(clip.get_algorithm_type() == AlgorithmType8::UniformlySampled, "Invalid algorithm type [%s], expected [%s]", get_algorithm_name(clip.get_algorithm_type()), get_algorithm_name(AlgorithmType8::UniformlySampled)); ACL_ENSURE(clip.is_valid(false), "Clip is invalid"); @@ -359,8 +359,8 @@ namespace acl float interpolation_alpha; calculate_interpolation_keys(header.num_samples, clip_duration, sample_time, key_frame0, key_frame1, interpolation_alpha); - impl::DecompressionContext context; - impl::initialize_context(header, key_frame0, key_frame1, context); + DecompressionContext context; + initialize_context(header, key_frame0, key_frame1, context); // TODO: Optimize this by counting the number of bits set, we can use the pop-count instruction on // architectures that support it (e.g. xb1/ps4). This would entirely avoid looping here. @@ -369,15 +369,15 @@ namespace acl if (bone_index == sample_bone_index) break; - impl::skip_rotation(context, header.rotation_format); - impl::skip_translation(context); + skip_rotation(context, header.rotation_format); + skip_translation(context); } - Quat_32 rotation = impl::decompress_rotation(context, header.rotation_format, interpolation_alpha); + Quat_32 rotation = decompress_rotation(context, header.rotation_format, interpolation_alpha); if (out_rotation != nullptr) *out_rotation = rotation; - Vector4_32 translation = impl::decompress_translation(context, header.rotation_format, interpolation_alpha); + Vector4_32 translation = decompress_translation(context, header.rotation_format, interpolation_alpha); if (out_translation != nullptr) *out_translation = translation; } diff --git a/includes/acl/algorithm/uniformly_sampled/encoder.h b/includes/acl/algorithm/uniformly_sampled/encoder.h index 4bffe635..7a19729a 100644 --- a/includes/acl/algorithm/uniformly_sampled/encoder.h +++ b/includes/acl/algorithm/uniformly_sampled/encoder.h @@ -146,26 +146,16 @@ namespace acl return quantize_unsigned_normalized((input * 0.5f) + 0.5f, num_bits); } - template - inline void unaligned_write(const SrcPtrType* src, uint8_t* dest, size_t size) - { - const uint8_t* src_u8 = reinterpret_cast(src); - for (size_t byte_index = 0; byte_index < size; ++byte_index) - dest[byte_index] = src_u8[byte_index]; - } - inline void write_rotation(const FullPrecisionHeader& header, const Quat_32& rotation, uint8_t*& out_rotation_data) { if (header.rotation_format == RotationFormat8::Quat_128) { quat_unaligned_write(rotation, safe_ptr_cast(out_rotation_data)); - out_rotation_data += sizeof(float) * 4; } else if (header.rotation_format == RotationFormat8::Quat_96) { Vector4_32 rotation_xyz = quat_to_vector(quat_ensure_positive_w(rotation)); vector_unaligned_write3(rotation_xyz, safe_ptr_cast(out_rotation_data)); - out_rotation_data += sizeof(float) * 3; } else if (header.rotation_format == RotationFormat8::Quat_48) { @@ -180,9 +170,22 @@ namespace acl data[0] = safe_static_cast(rotation_x); data[1] = safe_static_cast(rotation_y); data[2] = safe_static_cast(rotation_z); + } + else if (header.rotation_format == RotationFormat8::Quat_32) + { + // TODO: Normalize values before quantization, the remaining xyz range isn't [-1.0 .. 1.0]! + Vector4_32 rotation_xyz = quat_to_vector(quat_ensure_positive_w(rotation)); + + size_t rotation_x = quantize_signed_normalized(vector_get_x(rotation_xyz), 11); + size_t rotation_y = quantize_signed_normalized(vector_get_y(rotation_xyz), 11); + size_t rotation_z = quantize_signed_normalized(vector_get_z(rotation_xyz), 10); - out_rotation_data += sizeof(uint16_t) * 3; + // TODO: Always aligned for now because translations are always 12 bytes + uint32_t* data = safe_ptr_cast(out_rotation_data); + *data = safe_static_cast((rotation_x << 21) | (rotation_y << 10) | rotation_z); } + + out_rotation_data += get_rotation_size(header.rotation_format); } inline void write_translation(const FullPrecisionHeader& header, const Vector4_32& translation, uint8_t*& out_translation_data) @@ -195,9 +198,9 @@ namespace acl else { const float* translation_xyz = vector_as_float_ptr(translation); - unaligned_write(translation_xyz + 0, out_translation_data + (0 * sizeof(float)), sizeof(float)); - unaligned_write(translation_xyz + 1, out_translation_data + (1 * sizeof(float)), sizeof(float)); - unaligned_write(translation_xyz + 2, out_translation_data + (2 * sizeof(float)), sizeof(float)); + scalar_unaligned_write(translation_xyz[0], out_translation_data + (0 * sizeof(float))); + scalar_unaligned_write(translation_xyz[1], out_translation_data + (1 * sizeof(float))); + scalar_unaligned_write(translation_xyz[2], out_translation_data + (2 * sizeof(float))); } out_translation_data += sizeof(float) * 3; @@ -268,6 +271,8 @@ namespace acl // Encoder entry point inline CompressedClip* compress_clip(Allocator& allocator, const AnimationClip& clip, const RigidSkeleton& skeleton, RotationFormat8 rotation_format) { + using namespace impl; + uint16_t num_bones = clip.get_num_bones(); uint32_t num_samples = clip.get_num_samples(); @@ -275,29 +280,13 @@ namespace acl uint32_t num_constant_translation_tracks; uint32_t num_animated_rotation_tracks; uint32_t num_animated_translation_tracks; - impl::get_num_animated_tracks(clip, num_constant_rotation_tracks, num_constant_translation_tracks, num_animated_rotation_tracks, num_animated_translation_tracks); + get_num_animated_tracks(clip, num_constant_rotation_tracks, num_constant_translation_tracks, num_animated_rotation_tracks, num_animated_translation_tracks); - uint32_t constant_data_size = sizeof(float) * (num_constant_translation_tracks * 3); - uint32_t animated_data_size = sizeof(float) * (num_animated_translation_tracks * 3); + uint32_t rotation_size = get_rotation_size(rotation_format); + uint32_t translation_size = get_translation_size(VectorFormat8::Vector3_96); - switch (rotation_format) - { - case RotationFormat8::Quat_128: - constant_data_size += sizeof(float) * (num_constant_rotation_tracks * 4); - animated_data_size += sizeof(float) * (num_animated_rotation_tracks * 4); - break; - case RotationFormat8::Quat_96: - constant_data_size += sizeof(float) * (num_constant_rotation_tracks * 3); - animated_data_size += sizeof(float) * (num_animated_rotation_tracks * 3); - break; - case RotationFormat8::Quat_48: - constant_data_size += sizeof(uint16_t) * (num_constant_rotation_tracks * 3); - animated_data_size += sizeof(uint16_t) * (num_animated_rotation_tracks * 3); - break; - default: - ACL_ENSURE(false, "Invalid or unsupported rotation format: %s", get_rotation_format_name(rotation_format)); - break; - } + uint32_t constant_data_size = (rotation_size * num_constant_rotation_tracks) + (translation_size * num_constant_translation_tracks); + uint32_t animated_data_size = (rotation_size * num_animated_rotation_tracks) + (translation_size * num_animated_translation_tracks); animated_data_size *= num_samples; @@ -328,10 +317,10 @@ namespace acl header.constant_track_data_offset = header.constant_tracks_bitset_offset + (sizeof(uint32_t) * bitset_size); // Aligned to 4 bytes header.track_data_offset = align_to(header.constant_track_data_offset + constant_data_size, 4); // Aligned to 4 bytes - impl::write_default_track_bitset(header, clip, bitset_size); - impl::write_constant_track_bitset(header, clip, bitset_size); - impl::write_constant_track_data(header, clip, constant_data_size); - impl::write_animated_track_data(header, clip, animated_data_size); + write_default_track_bitset(header, clip, bitset_size); + write_constant_track_bitset(header, clip, bitset_size); + write_constant_track_data(header, clip, constant_data_size); + write_animated_track_data(header, clip, animated_data_size); finalize_compressed_clip(*compressed_clip); diff --git a/includes/acl/core/algorithm_globals.h b/includes/acl/core/algorithm_globals.h index 1ab856fe..47582946 100644 --- a/includes/acl/core/algorithm_globals.h +++ b/includes/acl/core/algorithm_globals.h @@ -51,8 +51,8 @@ namespace acl Quat_128 = 0, // Full precision quaternion, [x,y,z,w] stored with float32 Quat_96 = 1, // Full precision quaternion, [x,y,z] stored with float32 (w is dropped) Quat_48 = 2, // Quantized quaternion, [x,y,z] stored with [16,16,16] bits (w is dropped) + Quat_32 = 3, // Quantized quaternion, [x,y,z] stored with [11,11,10] bits (w is dropped) // TODO: Implement these - //Quat_32, // Quantized quaternion, [x,y,z] stored with [11,11,10] bits (w is dropped) //Quat_Variable, // Quantized quaternion, [x,y,z] stored with [N,N,N] bits (w is dropped, same number of bits per component) //QuatLog_96, // Full precision quaternion logarithm, [x,y,z] stored with float 32 //QuatLog_48, // Quantized quaternion logarithm, [x,y,z] stored with [16,16,16] bits @@ -120,7 +120,18 @@ namespace acl case RotationFormat8::Quat_128: return "Quat 128"; case RotationFormat8::Quat_96: return "Quat 96"; case RotationFormat8::Quat_48: return "Quat 48"; + case RotationFormat8::Quat_32: return "Quat 32"; default: return ""; } } + + // TODO: constexpr + inline const char* get_vector_format_name(VectorFormat8 format) + { + switch (format) + { + case VectorFormat8::Vector3_96: return "Vector3 96"; + default: return ""; + } + } } diff --git a/includes/acl/math/scalar_32.h b/includes/acl/math/scalar_32.h index 107ad6ec..3eb5793e 100644 --- a/includes/acl/math/scalar_32.h +++ b/includes/acl/math/scalar_32.h @@ -73,4 +73,20 @@ namespace acl ACL_ENSURE(SrcIntegralType(input_f) == input, "Convertion to float would result in truncation"); return input_f; } + + inline void scalar_unaligned_load(const uint8_t* src, float& dest) + { + // TODO: Safe with SSE? + uint8_t* dest_u8 = reinterpret_cast(&dest); + for (size_t byte_index = 0; byte_index < sizeof(float); ++byte_index) + dest_u8[byte_index] = src[byte_index]; + } + + inline void scalar_unaligned_write(const float& src, uint8_t* dest) + { + // TODO: Safe with SSE? + const uint8_t* src_u8 = reinterpret_cast(&src); + for (size_t byte_index = 0; byte_index < sizeof(float); ++byte_index) + dest[byte_index] = src_u8[byte_index]; + } } diff --git a/tools/acl_compressor/sources/main.cpp b/tools/acl_compressor/sources/main.cpp index e0c1b075..b0790e66 100644 --- a/tools/acl_compressor/sources/main.cpp +++ b/tools/acl_compressor/sources/main.cpp @@ -405,6 +405,7 @@ int main(int argc, char** argv) try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_128); try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_96); try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_48); + try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_32); } if (IsDebuggerPresent()) From 28abad4076b4d1a6149c9d841185342294fc9f9b Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Wed, 7 Jun 2017 22:17:42 -0400 Subject: [PATCH 2/2] Adding vector format Vector3_48 but cannot be used without range reduction. --- includes/acl/algorithm/ialgorithm.h | 2 +- .../algorithm/uniformly_sampled/algorithm.h | 4 +- .../acl/algorithm/uniformly_sampled/common.h | 4 +- .../acl/algorithm/uniformly_sampled/decoder.h | 70 +++++++------------ .../acl/algorithm/uniformly_sampled/encoder.h | 41 ++++++----- includes/acl/core/algorithm_globals.h | 5 +- includes/acl/math/quat_32.h | 17 +++++ includes/acl/math/vector4_32.h | 16 +++++ tools/acl_compressor/sources/main.cpp | 12 ++-- 9 files changed, 96 insertions(+), 75 deletions(-) diff --git a/includes/acl/algorithm/ialgorithm.h b/includes/acl/algorithm/ialgorithm.h index 8ff92fb3..963d67c5 100644 --- a/includes/acl/algorithm/ialgorithm.h +++ b/includes/acl/algorithm/ialgorithm.h @@ -40,7 +40,7 @@ namespace acl public: virtual ~IAlgorithm() {} - virtual CompressedClip* compress_clip(Allocator& allocator, const AnimationClip& clip, const RigidSkeleton& skeleton, RotationFormat8 rotation_format) = 0; + virtual CompressedClip* compress_clip(Allocator& allocator, const AnimationClip& clip, const RigidSkeleton& skeleton, RotationFormat8 rotation_format, VectorFormat8 translation_format) = 0; virtual void decompress_pose(const CompressedClip& clip, float sample_time, Transform_32* out_transforms, uint16_t num_transforms) = 0; virtual void decompress_bone(const CompressedClip& clip, float sample_time, uint16_t sample_bone_index, Quat_32* out_rotation, Vector4_32* out_translation) = 0; diff --git a/includes/acl/algorithm/uniformly_sampled/algorithm.h b/includes/acl/algorithm/uniformly_sampled/algorithm.h index 03e176e6..b751016a 100644 --- a/includes/acl/algorithm/uniformly_sampled/algorithm.h +++ b/includes/acl/algorithm/uniformly_sampled/algorithm.h @@ -33,9 +33,9 @@ namespace acl class UniformlySampledAlgorithm final : public IAlgorithm { public: - virtual CompressedClip* compress_clip(Allocator& allocator, const AnimationClip& clip, const RigidSkeleton& skeleton, RotationFormat8 rotation_format) override + virtual CompressedClip* compress_clip(Allocator& allocator, const AnimationClip& clip, const RigidSkeleton& skeleton, RotationFormat8 rotation_format, VectorFormat8 translation_format) override { - return uniformly_sampled::compress_clip(allocator, clip, skeleton, rotation_format); + return uniformly_sampled::compress_clip(allocator, clip, skeleton, rotation_format, translation_format); } virtual void decompress_pose(const CompressedClip& clip, float sample_time, Transform_32* out_transforms, uint16_t num_transforms) override diff --git a/includes/acl/algorithm/uniformly_sampled/common.h b/includes/acl/algorithm/uniformly_sampled/common.h index 48583673..cb839b9f 100644 --- a/includes/acl/algorithm/uniformly_sampled/common.h +++ b/includes/acl/algorithm/uniformly_sampled/common.h @@ -45,6 +45,7 @@ namespace acl { uint16_t num_bones; RotationFormat8 rotation_format; + VectorFormat8 translation_format; uint32_t num_samples; uint32_t sample_rate; // TODO: Store duration as float instead uint32_t num_animated_rotation_tracks; // TODO: Calculate from bitsets? @@ -100,7 +101,8 @@ namespace acl { switch (translation_format) { - case VectorFormat8::Vector3_96: return sizeof(float) * 3; + case VectorFormat8::Vector3_96: return sizeof(float) * 3; + case VectorFormat8::Vector3_48: return sizeof(uint16_t) * 3; default: ACL_ENSURE(false, "Invalid or unsupported translation format: %s", get_vector_format_name(translation_format)); return 0; diff --git a/includes/acl/algorithm/uniformly_sampled/decoder.h b/includes/acl/algorithm/uniformly_sampled/decoder.h index ad63ba19..96a9352d 100644 --- a/includes/acl/algorithm/uniformly_sampled/decoder.h +++ b/includes/acl/algorithm/uniformly_sampled/decoder.h @@ -73,6 +73,7 @@ namespace acl uint32_t bitset_size; uint32_t rotation_size; + uint32_t translation_size; // Read-write data uint32_t default_track_offset; @@ -82,7 +83,7 @@ namespace acl inline void initialize_context(const FullPrecisionHeader& header, uint32_t key_frame0, uint32_t key_frame1, DecompressionContext& context) { uint32_t rotation_size = get_rotation_size(header.rotation_format); - uint32_t translation_size = get_translation_size(VectorFormat8::Vector3_96); + uint32_t translation_size = get_translation_size(header.translation_format); // TODO: No need to store this, unpack from bitset in context and simplify branching logic below? uint32_t animated_pose_size = (rotation_size * header.num_animated_rotation_tracks) + (translation_size * header.num_animated_translation_tracks); @@ -98,6 +99,7 @@ namespace acl context.bitset_size = ((header.num_bones * FullPrecisionConstants::NUM_TRACKS_PER_BONE) + FullPrecisionConstants::BITSET_WIDTH - 1) / FullPrecisionConstants::BITSET_WIDTH; context.rotation_size = rotation_size; + context.translation_size = translation_size; context.constant_track_offset = 0; context.default_track_offset = 0; @@ -132,12 +134,12 @@ namespace acl bool is_translation_constant = bitset_test(context.constant_tracks_bitset, context.bitset_size, context.constant_track_offset); if (is_translation_constant) { - context.constant_track_data += sizeof(float) * 3; + context.constant_track_data += context.translation_size; } else { - context.key_frame_data0 += sizeof(float) * 3; - context.key_frame_data1 += sizeof(float) * 3; + context.key_frame_data0 += context.translation_size; + context.key_frame_data1 += context.translation_size; } } @@ -147,12 +149,12 @@ namespace acl inline Quat_32 decompress_rotation_quat_128(const uint8_t* data_ptr) { - return quat_unaligned_load(safe_ptr_cast(data_ptr)); + return quat_unaligned_load(data_ptr); } inline Quat_32 decompress_rotation_quat_96(const uint8_t* data_ptr) { - Vector4_32 rotation_xyz = vector_unaligned_load3(safe_ptr_cast(data_ptr)); + Vector4_32 rotation_xyz = vector_unaligned_load3(data_ptr); return quat_from_positive_w(rotation_xyz); } @@ -180,11 +182,12 @@ namespace acl inline Quat_32 decompress_rotation_quat_32(const uint8_t* data_ptr) { - // TODO: Always aligned for now because translations are always 12 bytes - const uint32_t* data_ptr_u32 = safe_ptr_cast(data_ptr); - size_t x = *data_ptr_u32 >> 21; - size_t y = (*data_ptr_u32 >> 10) & ((1 << 11) - 1); - size_t z = *data_ptr_u32 & ((1 << 10) - 1); + // Read 2 bytes at a time to ensure safe alignment + const uint16_t* data_ptr_u16 = safe_ptr_cast(data_ptr); + uint32_t rotation_u32 = (safe_static_cast(data_ptr_u16[0]) << 16) | safe_static_cast(data_ptr_u16[1]); + size_t x = rotation_u32 >> 21; + size_t y = (rotation_u32 >> 10) & ((1 << 11) - 1); + size_t z = rotation_u32 & ((1 << 10) - 1); Vector4_32 rotation_xyz = vector_set(dequantize_signed_normalized(x, 11), dequantize_signed_normalized(y, 11), dequantize_signed_normalized(z, 10)); return quat_from_positive_w(rotation_xyz); } @@ -194,7 +197,9 @@ namespace acl Quat_32 rotation; bool is_rotation_default = bitset_test(context.default_tracks_bitset, context.bitset_size, context.default_track_offset); if (is_rotation_default) + { rotation = quat_identity_32(); + } else { bool is_rotation_constant = bitset_test(context.constant_tracks_bitset, context.bitset_size, context.constant_track_offset); @@ -256,54 +261,27 @@ namespace acl Vector4_32 translation; bool is_translation_default = bitset_test(context.default_tracks_bitset, context.bitset_size, context.default_track_offset); if (is_translation_default) + { translation = vector_zero_32(); + } else { bool are_translations_always_aligned = rotation_format != RotationFormat8::Quat_48; bool is_translation_constant = bitset_test(context.constant_tracks_bitset, context.bitset_size, context.constant_track_offset); if (is_translation_constant) { - if (are_translations_always_aligned) - translation = vector_unaligned_load3(safe_ptr_cast(context.constant_track_data)); - else - { - float* translation_xyz = vector_as_float_ptr(translation); - scalar_unaligned_load(context.constant_track_data + (0 * sizeof(float)), translation_xyz[0]); - scalar_unaligned_load(context.constant_track_data + (1 * sizeof(float)), translation_xyz[1]); - scalar_unaligned_load(context.constant_track_data + (2 * sizeof(float)), translation_xyz[2]); - translation_xyz[3] = 0.0f; - } - - context.constant_track_data += sizeof(float) * 3; + translation = vector_unaligned_load3(context.constant_track_data); + context.constant_track_data += context.translation_size; } else { - Vector4_32 translation0; - Vector4_32 translation1; - if (are_translations_always_aligned) - { - translation0 = vector_unaligned_load3(safe_ptr_cast(context.key_frame_data0)); - translation1 = vector_unaligned_load3(safe_ptr_cast(context.key_frame_data1)); - } - else - { - float* translation0_xyz = vector_as_float_ptr(translation0); - scalar_unaligned_load(context.key_frame_data0 + (0 * sizeof(float)), translation0_xyz[0]); - scalar_unaligned_load(context.key_frame_data0 + (1 * sizeof(float)), translation0_xyz[1]); - scalar_unaligned_load(context.key_frame_data0 + (2 * sizeof(float)), translation0_xyz[2]); - translation0_xyz[3] = 0.0f; - - float* translation1_xyz = vector_as_float_ptr(translation1); - scalar_unaligned_load(context.key_frame_data1 + (0 * sizeof(float)), translation1_xyz[0]); - scalar_unaligned_load(context.key_frame_data1 + (1 * sizeof(float)), translation1_xyz[1]); - scalar_unaligned_load(context.key_frame_data1 + (2 * sizeof(float)), translation1_xyz[2]); - translation1_xyz[3] = 0.0f; - } + Vector4_32 translation0 = vector_unaligned_load3(context.key_frame_data0); + Vector4_32 translation1 = vector_unaligned_load3(context.key_frame_data1); translation = vector_lerp(translation0, translation1, interpolation_alpha); - context.key_frame_data0 += sizeof(float) * 3; - context.key_frame_data1 += sizeof(float) * 3; + context.key_frame_data0 += context.translation_size; + context.key_frame_data1 += context.translation_size; } } diff --git a/includes/acl/algorithm/uniformly_sampled/encoder.h b/includes/acl/algorithm/uniformly_sampled/encoder.h index 7a19729a..951e7328 100644 --- a/includes/acl/algorithm/uniformly_sampled/encoder.h +++ b/includes/acl/algorithm/uniformly_sampled/encoder.h @@ -150,16 +150,15 @@ namespace acl { if (header.rotation_format == RotationFormat8::Quat_128) { - quat_unaligned_write(rotation, safe_ptr_cast(out_rotation_data)); + quat_unaligned_write(rotation, out_rotation_data); } else if (header.rotation_format == RotationFormat8::Quat_96) { Vector4_32 rotation_xyz = quat_to_vector(quat_ensure_positive_w(rotation)); - vector_unaligned_write3(rotation_xyz, safe_ptr_cast(out_rotation_data)); + vector_unaligned_write3(rotation_xyz, out_rotation_data); } else if (header.rotation_format == RotationFormat8::Quat_48) { - // TODO: Normalize values before quantization, the remaining xyz range isn't [-1.0 .. 1.0]! Vector4_32 rotation_xyz = quat_to_vector(quat_ensure_positive_w(rotation)); size_t rotation_x = quantize_signed_normalized(vector_get_x(rotation_xyz), 16); @@ -173,16 +172,18 @@ namespace acl } else if (header.rotation_format == RotationFormat8::Quat_32) { - // TODO: Normalize values before quantization, the remaining xyz range isn't [-1.0 .. 1.0]! Vector4_32 rotation_xyz = quat_to_vector(quat_ensure_positive_w(rotation)); size_t rotation_x = quantize_signed_normalized(vector_get_x(rotation_xyz), 11); size_t rotation_y = quantize_signed_normalized(vector_get_y(rotation_xyz), 11); size_t rotation_z = quantize_signed_normalized(vector_get_z(rotation_xyz), 10); - // TODO: Always aligned for now because translations are always 12 bytes - uint32_t* data = safe_ptr_cast(out_rotation_data); - *data = safe_static_cast((rotation_x << 21) | (rotation_y << 10) | rotation_z); + uint32_t rotation_u32 = safe_static_cast((rotation_x << 21) | (rotation_y << 10) | rotation_z); + + // Written 2 bytes at a time to ensure safe alignment + uint16_t* data = safe_ptr_cast(out_rotation_data); + data[0] = safe_static_cast(rotation_u32 >> 16); + data[1] = safe_static_cast(rotation_u32 & 0xFFFF); } out_rotation_data += get_rotation_size(header.rotation_format); @@ -190,20 +191,23 @@ namespace acl inline void write_translation(const FullPrecisionHeader& header, const Vector4_32& translation, uint8_t*& out_translation_data) { - bool are_translations_always_aligned = header.rotation_format != RotationFormat8::Quat_48; - if (are_translations_always_aligned) + if (header.translation_format == VectorFormat8::Vector3_96) { - vector_unaligned_write3(translation, safe_ptr_cast(out_translation_data)); + vector_unaligned_write3(translation, out_translation_data); } - else + else if (header.translation_format == VectorFormat8::Vector3_48) { - const float* translation_xyz = vector_as_float_ptr(translation); - scalar_unaligned_write(translation_xyz[0], out_translation_data + (0 * sizeof(float))); - scalar_unaligned_write(translation_xyz[1], out_translation_data + (1 * sizeof(float))); - scalar_unaligned_write(translation_xyz[2], out_translation_data + (2 * sizeof(float))); + size_t translation_x = quantize_signed_normalized(vector_get_x(translation), 16); + size_t translation_y = quantize_signed_normalized(vector_get_y(translation), 16); + size_t translation_z = quantize_signed_normalized(vector_get_z(translation), 16); + + uint16_t* data = safe_ptr_cast(out_translation_data); + data[0] = safe_static_cast(translation_x); + data[1] = safe_static_cast(translation_y); + data[2] = safe_static_cast(translation_z); } - out_translation_data += sizeof(float) * 3; + out_translation_data += get_translation_size(header.translation_format); } inline void write_constant_track_data(FullPrecisionHeader& header, const AnimationClip& clip, uint32_t constant_data_size) @@ -269,7 +273,7 @@ namespace acl } // Encoder entry point - inline CompressedClip* compress_clip(Allocator& allocator, const AnimationClip& clip, const RigidSkeleton& skeleton, RotationFormat8 rotation_format) + inline CompressedClip* compress_clip(Allocator& allocator, const AnimationClip& clip, const RigidSkeleton& skeleton, RotationFormat8 rotation_format, VectorFormat8 translation_format) { using namespace impl; @@ -283,7 +287,7 @@ namespace acl get_num_animated_tracks(clip, num_constant_rotation_tracks, num_constant_translation_tracks, num_animated_rotation_tracks, num_animated_translation_tracks); uint32_t rotation_size = get_rotation_size(rotation_format); - uint32_t translation_size = get_translation_size(VectorFormat8::Vector3_96); + uint32_t translation_size = get_translation_size(translation_format); uint32_t constant_data_size = (rotation_size * num_constant_rotation_tracks) + (translation_size * num_constant_translation_tracks); uint32_t animated_data_size = (rotation_size * num_animated_rotation_tracks) + (translation_size * num_animated_translation_tracks); @@ -308,6 +312,7 @@ namespace acl FullPrecisionHeader& header = get_full_precision_header(*compressed_clip); header.num_bones = num_bones; header.rotation_format = rotation_format; + header.translation_format = translation_format; header.num_samples = num_samples; header.sample_rate = clip.get_sample_rate(); header.num_animated_rotation_tracks = num_animated_rotation_tracks; diff --git a/includes/acl/core/algorithm_globals.h b/includes/acl/core/algorithm_globals.h index 47582946..f5300dc6 100644 --- a/includes/acl/core/algorithm_globals.h +++ b/includes/acl/core/algorithm_globals.h @@ -53,6 +53,7 @@ namespace acl Quat_48 = 2, // Quantized quaternion, [x,y,z] stored with [16,16,16] bits (w is dropped) Quat_32 = 3, // Quantized quaternion, [x,y,z] stored with [11,11,10] bits (w is dropped) // TODO: Implement these + //Quat_32_Largest, // Quantized quaternion, [?,?,?] stored with [10,10,10] bits (largest component is dropped, component index stored on 2 bits) //Quat_Variable, // Quantized quaternion, [x,y,z] stored with [N,N,N] bits (w is dropped, same number of bits per component) //QuatLog_96, // Full precision quaternion logarithm, [x,y,z] stored with float 32 //QuatLog_48, // Quantized quaternion logarithm, [x,y,z] stored with [16,16,16] bits @@ -66,8 +67,9 @@ namespace acl enum class VectorFormat8 : uint8_t { Vector3_96 = 0, // Full precision vector3, [x,y,z] stored with float32 + Vector3_48 = 1, // Quantized vector3, [x,y,z] stored with [16,16,16] bits // TODO: Implement these - //Vector3_48, // Quantized vector3, [x,y,z] stored with [16,16,16] bits + //Vector3_32, // Quantized vector3, [x,y,z] stored with [11,11,10] bits //Vector3_Variable, // Quantized vector3, [x,y,z] stored with [N,N,N] bits (same number of bits per component) }; @@ -131,6 +133,7 @@ namespace acl switch (format) { case VectorFormat8::Vector3_96: return "Vector3 96"; + case VectorFormat8::Vector3_48: return "Vector3 48"; default: return ""; } } diff --git a/includes/acl/math/quat_32.h b/includes/acl/math/quat_32.h index 52f69677..5e8e9390 100644 --- a/includes/acl/math/quat_32.h +++ b/includes/acl/math/quat_32.h @@ -47,6 +47,13 @@ namespace acl return quat_set(input[0], input[1], input[2], input[3]); } + inline Quat_32 quat_unaligned_load(const uint8_t* input) + { + // TODO: Cross platform unaligned read needs to be safe + const float* input_f = reinterpret_cast(input); + return quat_set(input_f[0], input_f[1], input_f[2], input_f[3]); + } + inline Quat_32 quat_identity_32() { return quat_set(0.0f, 0.0f, 0.0f, 1.0f); @@ -115,6 +122,16 @@ namespace acl output[3] = quat_get_w(input); } + inline void quat_unaligned_write(const Quat_32& input, uint8_t* output) + { + // TODO: Cross platform unaligned write needs to be safe + float* output_f = reinterpret_cast(output); + output_f[0] = quat_get_x(input); + output_f[1] = quat_get_y(input); + output_f[2] = quat_get_z(input); + output_f[3] = quat_get_w(input); + } + inline Quat_32 quat_conjugate(const Quat_32& input) { return quat_set(-quat_get_x(input), -quat_get_y(input), -quat_get_z(input), quat_get_w(input)); diff --git a/includes/acl/math/vector4_32.h b/includes/acl/math/vector4_32.h index a7bc14ec..270131a8 100644 --- a/includes/acl/math/vector4_32.h +++ b/includes/acl/math/vector4_32.h @@ -69,6 +69,13 @@ namespace acl return vector_set(input[0], input[1], input[2], 0.0f); } + inline Vector4_32 vector_unaligned_load3(const uint8_t* input) + { + // TODO: Cross platform unaligned read needs to be safe + const float* input_f = reinterpret_cast(input); + return vector_set(input_f[0], input_f[1], input_f[2], 0.0f); + } + inline Vector4_32 vector_zero_32() { return vector_set(0.0f, 0.0f, 0.0f, 0.0f); @@ -155,6 +162,15 @@ namespace acl output[2] = vector_get_z(input); } + inline void vector_unaligned_write3(const Vector4_32& input, uint8_t* output) + { + // TODO: Cross platform unaligned write needs to be safe + float* output_f = reinterpret_cast(output); + output_f[0] = vector_get_x(input); + output_f[1] = vector_get_y(input); + output_f[2] = vector_get_z(input); + } + inline Vector4_32 vector_add(const Vector4_32& lhs, const Vector4_32& rhs) { #if defined(ACL_SSE2_INTRINSICS) diff --git a/tools/acl_compressor/sources/main.cpp b/tools/acl_compressor/sources/main.cpp index b0790e66..aa0f0675 100644 --- a/tools/acl_compressor/sources/main.cpp +++ b/tools/acl_compressor/sources/main.cpp @@ -333,7 +333,7 @@ static double find_max_error(acl::Allocator& allocator, const acl::AnimationClip } template -static void try_algorithm(const Options& options, acl::Allocator& allocator, const acl::AnimationClip& clip, const acl::RigidSkeleton& skeleton, acl::RotationFormat8 rotation_format) +static void try_algorithm(const Options& options, acl::Allocator& allocator, const acl::AnimationClip& clip, const acl::RigidSkeleton& skeleton, acl::RotationFormat8 rotation_format, acl::VectorFormat8 translation_format) { using namespace acl; @@ -341,7 +341,7 @@ static void try_algorithm(const Options& options, acl::Allocator& allocator, con QueryPerformanceCounter(&start_time_cycles); AlgorithmType8 algorithm; - CompressedClip* compressed_clip = algorithm.compress_clip(allocator, clip, skeleton, rotation_format); + CompressedClip* compressed_clip = algorithm.compress_clip(allocator, clip, skeleton, rotation_format, translation_format); LARGE_INTEGER end_time_cycles; QueryPerformanceCounter(&end_time_cycles); @@ -402,10 +402,10 @@ int main(int argc, char** argv) // Compress & Decompress { - try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_128); - try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_96); - try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_48); - try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_32); + try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_128, acl::VectorFormat8::Vector3_96); + try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_96, acl::VectorFormat8::Vector3_96); + try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_48, acl::VectorFormat8::Vector3_96); + try_algorithm(options, allocator, *clip.get(), *skeleton.get(), RotationFormat8::Quat_32, acl::VectorFormat8::Vector3_96); } if (IsDebuggerPresent())